gfx/thebes/gfxAlphaRecoverySSE2.cpp

changeset 6474c204b198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gfx/thebes/gfxAlphaRecoverySSE2.cpp	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,236 @@
+/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
+
+#include "gfxAlphaRecovery.h"
+#include "gfxImageSurface.h"
+#include "nsRect.h"
+#include <emmintrin.h>
+
+// This file should only be compiled on x86 and x64 systems.  Additionally,
+// you'll need to compile it with -msse2 if you're using GCC on x86.
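+// (For example, a standalone GCC compile would look something like
+// "g++ -msse2 -c gfxAlphaRecoverySSE2.cpp"; in the real tree the build
+// system is expected to attach the SSE2 flags to this file.)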
+
+#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
+__declspec(align(16)) static uint32_t greenMaski[] =
+    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
+__declspec(align(16)) static uint32_t alphaMaski[] =
+    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
+#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
+static uint32_t greenMaski[] __attribute__ ((aligned (16))) =
+    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
+static uint32_t alphaMaski[] __attribute__ ((aligned (16))) =
+    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
+#elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__))
+#pragma align 16 (greenMaski, alphaMaski)
+static uint32_t greenMaski[] = { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
+static uint32_t alphaMaski[] = { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
+#endif
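+
+// For reference, a scalar sketch of the per-pixel math that the SSE2 loops
+// below vectorize.  The real helper is RecoverPixel(), declared in
+// gfxAlphaRecovery.h; this illustrative copy assumes packed 0xAARRGGBB
+// pixels and is not called from this file.
+//
+// The content was drawn twice, once over black and once over white.  A
+// pixel with alpha A and green G reads back as green = G*A/255 over black
+// and as green = G*A/255 + (255 - A) over white, so
+// A = 255 - (greenOnWhite - greenOnBlack).
+static inline uint32_t
+RecoverPixelSketch(uint32_t black, uint32_t white)
+{
+    uint32_t greenBlack = (black >> 8) & 0xff;
+    uint32_t greenWhite = (white >> 8) & 0xff;
+    // Saturate like _mm_subs_epu8 does, in case rounding made the white
+    // rendering slightly less green than the black one.
+    uint32_t greenDiff = greenWhite > greenBlack ? greenWhite - greenBlack : 0;
+    uint32_t alpha = 255 - greenDiff;
+    return (alpha << 24) | (black & 0x00ffffff);  // keep RGB from |black|
+}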
+
+bool
+gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
+                                   const gfxImageSurface* whiteSurf)
+{
+    gfxIntSize size = blackSurf->GetSize();
+
+    if (size != whiteSurf->GetSize() ||
+        (blackSurf->Format() != gfxImageFormat::ARGB32 &&
+         blackSurf->Format() != gfxImageFormat::RGB24) ||
+        (whiteSurf->Format() != gfxImageFormat::ARGB32 &&
+         whiteSurf->Format() != gfxImageFormat::RGB24))
+        return false;
+
+    blackSurf->Flush();
+    whiteSurf->Flush();
+
+    unsigned char* blackData = blackSurf->Data();
+    unsigned char* whiteData = whiteSurf->Data();
+
+    if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
+        (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
+        // Cannot keep these in alignment.
+        return false;
+    }
+
+    __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
+    __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);
+
+    for (int32_t i = 0; i < size.height; ++i) {
+        int32_t j = 0;
+        // Loop single pixels until at 16 byte alignment.
+        while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
+            *((uint32_t*)blackData) =
+                RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
+                             *reinterpret_cast<uint32_t*>(whiteData));
+            blackData += 4;
+            whiteData += 4;
+            j++;
+        }
+        // This extra loop allows the compiler to do some more clever register
+        // allocation and makes it about 5% faster than with only the
+        // 4-pixels-at-a-time loop below.
+        for (; j < size.width - 8; j += 8) {
+            __m128i black1 = _mm_load_si128((__m128i*)blackData);
+            __m128i white1 = _mm_load_si128((__m128i*)whiteData);
+            __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
+            __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));
+
+            // Execute the same instructions as described in RecoverPixel, only
+            // using an SSE2 packed saturated subtract.
+            white1 = _mm_subs_epu8(white1, black1);
+            white2 = _mm_subs_epu8(white2, black2);
+            white1 = _mm_subs_epu8(greenMask, white1);
+            white2 = _mm_subs_epu8(greenMask, white2);
+            // Producing the final black pixel in an XMM register and storing
+            // that is actually faster than doing a masked store, since the
+            // masked store would be an unaligned store and we have the black
+            // pixel in a register anyway.
+            black1 = _mm_andnot_si128(alphaMask, black1);
+            black2 = _mm_andnot_si128(alphaMask, black2);
+            white1 = _mm_slli_si128(white1, 2);
+            white2 = _mm_slli_si128(white2, 2);
+            white1 = _mm_and_si128(alphaMask, white1);
+            white2 = _mm_and_si128(alphaMask, white2);
+            black1 = _mm_or_si128(white1, black1);
+            black2 = _mm_or_si128(white2, black2);
+
+            _mm_store_si128((__m128i*)blackData, black1);
+            _mm_store_si128((__m128i*)(blackData + 16), black2);
+            blackData += 32;
+            whiteData += 32;
+        }
+        for (; j < size.width - 4; j += 4) {
+            __m128i black = _mm_load_si128((__m128i*)blackData);
+            __m128i white = _mm_load_si128((__m128i*)whiteData);
+
+            white = _mm_subs_epu8(white, black);
+            white = _mm_subs_epu8(greenMask, white);
+            black = _mm_andnot_si128(alphaMask, black);
+            white = _mm_slli_si128(white, 2);
+            white = _mm_and_si128(alphaMask, white);
+            black = _mm_or_si128(white, black);
+            _mm_store_si128((__m128i*)blackData, black);
+            blackData += 16;
+            whiteData += 16;
+        }
+        // Loop single pixels until we're done.
+        while (j < size.width) {
+            *((uint32_t*)blackData) =
+                RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
+                             *reinterpret_cast<uint32_t*>(whiteData));
+            blackData += 4;
+            whiteData += 4;
+            j++;
+        }
+        blackData += blackSurf->Stride() - j * 4;
+        whiteData += whiteSurf->Stride() - j * 4;
+    }
+
+    blackSurf->MarkDirty();
+
+    return true;
+}
+
+static int32_t
+ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY = 0, int32_t aStride = 1)
+{
+    return (aX + aStride * aY) & ((1 << aAlignToLog2) - 1);
+}
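+// Worked example (hypothetical numbers): with 16-byte alignment
+// (aAlignToLog2 = 4), 4 bytes per pixel, and a 400-byte stride, the pixel
+// at <x=3, y=2> starts at byte 3*4 + 2*400 = 812, and
+// ByteAlignment(4, 12, 2, 400) == 812 & 15 == 12, i.e. that pixel sits
+// 12 bytes past the previous 16-byte boundary.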
+
+/*static*/ nsIntRect
+gfxAlphaRecovery::AlignRectForSubimageRecovery(const nsIntRect& aRect,
+                                               gfxImageSurface* aSurface)
+{
+    NS_ASSERTION(gfxImageFormat::ARGB32 == aSurface->Format(),
+                 "Thebes grew support for non-ARGB32 COLOR_ALPHA?");
+    static const int32_t kByteAlignLog2 = GoodAlignmentLog2();
+    static const int32_t bpp = 4;
+    static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp;
+    //
+    // We're going to create a subimage of the surface with size
+    // <sw,sh> for alpha recovery, and want a SIMD fast-path.  The
+    // rect <x,y, w,h> /needs/ to be redrawn, but it might not be
+    // properly aligned for SIMD.  So we want to find a rect <x',y',
+    // w',h'> that's a superset of what needs to be redrawn but is
+    // properly aligned.  Proper alignment is
+    //
+    //   BPP * (x' + y' * sw) \cong 0         (mod ALIGN)
+    //   BPP * w'             \cong BPP * sw  (mod ALIGN)
+    //
+    // (We assume the pixel at surface <0,0> is already ALIGN'd.)
+    // That rect (obviously) has to fit within the surface bounds, and
+    // we should also minimize the extra pixels redrawn only for
+    // alignment's sake.  So we also want
+    //
+    //  minimize <x',y', w',h'>
+    //   0 <= x' <= x
+    //   0 <= y' <= y
+    //   w <= w' <= sw
+    //   h <= h' <= sh
+    //
+    // This is a messy integer non-linear programming problem, except
+    // ... we can assume that ALIGN/BPP is a very small constant.  So,
+    // brute force is viable.  The algorithm below will find a
+    // solution if one exists, but isn't guaranteed to find the
+    // minimum solution.  (For SSE2, ALIGN/BPP = 4, so it'll do at
+    // most 64 iterations below).  In what's likely the common case,
+    // an already-aligned rectangle, it only needs 1 iteration.
+    //
+    // Is this alignment worth doing?  Recovering alpha will take work
+    // proportional to w*h (assuming alpha recovery computation isn't
+    // memory bound).  This analysis can lead to O(w+h) extra work
+    // (with small constants).  In exchange, we expect to shave off a
+    // constant factor of ALIGN/BPP by using SIMD-ized alpha recovery.
+    // So as w*h diverges from w+h, the win factor approaches ALIGN/BPP.
+    // We only really care about the w*h >> w+h case anyway; others
+    // should be fast enough even with the overhead.  (Unless the cost
+    // of repainting the expanded rect is high, but in that case
+    // SIMD-ized alpha recovery won't make a difference so this code
+    // shouldn't be called.)
+    //
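+    // A worked instance (hypothetical numbers): on a 100-pixel-wide
+    // ARGB32 surface (stride = 400 bytes, so pixPerAlign = 4 and the
+    // stride's 16-byte residue is 0), a dirty rect <x=5, y=3, w=10, h=10>
+    // starts at byte 4 * (5 + 3*100) = 1220, which is 4 bytes past a
+    // 16-byte boundary.  The search below finds dy=0, dx=1, since
+    // 4 * (4 + 3*100) = 1216 is aligned, and then dr=1, since
+    // 4 * (10 + 1 + 1) = 48 matches the stride's residue of 0 (mod 16);
+    // it returns the expanded rect <4, 3, 12, 10>.
+    //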
+    gfxIntSize surfaceSize = aSurface->GetSize();
+    const int32_t stride = bpp * surfaceSize.width;
+    if (stride != aSurface->Stride()) {
+        NS_WARNING("Unexpected stride, falling back on slow alpha recovery");
+        return aRect;
+    }
+
+    const int32_t x = aRect.x, y = aRect.y, w = aRect.width, h = aRect.height;
+    const int32_t r = x + w;
+    const int32_t sw = surfaceSize.width;
+    const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride);
+
+    // The outer two loops below keep the rightmost (|r| above) and
+    // bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we
+    // return only a superset of the original rect.  These loops
+    // search for an aligned top-left pixel by trying to expand <x,y>
+    // left and up by <dx,dy> pixels, respectively.
+    //
+    // Then if a properly-aligned top-left pixel is found, the
+    // innermost loop tries to find an aligned stride by moving the
+    // rightmost pixel rightward by dr.
+    int32_t dx, dy, dr;
+    for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) {
+        for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) {
+            if (0 != ByteAlignment(kByteAlignLog2,
+                                   bpp * (x - dx), y - dy, stride)) {
+                continue;
+            }
+            for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) {
+                if (strideAlign == ByteAlignment(kByteAlignLog2,
+                                                 bpp * (w + dr + dx))) {
+                    goto FOUND_SOLUTION;
+                }
+            }
+        }
+    }
+
+    // Didn't find a solution.
+    return aRect;
+
+FOUND_SOLUTION:
+    nsIntRect solution = nsIntRect(x - dx, y - dy, w + dr + dx, h + dy);
+    NS_ABORT_IF_FALSE(nsIntRect(0, 0, sw, surfaceSize.height).Contains(solution),
+                      "'Solution' extends outside surface bounds!");
+    return solution;
+}
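+
+// Illustrative call sequence (a sketch only; RecoverAlphaSSE2 and
+// AlignRectForSubimageRecovery are declared in gfxAlphaRecovery.h, while
+// |dirtyRect|, |surface|, |blackSurf|, and |whiteSurf| are hypothetical):
+//
+//   nsIntRect aligned =
+//       gfxAlphaRecovery::AlignRectForSubimageRecovery(dirtyRect, surface);
+//   // ... repaint |aligned| onto a black and onto a white surface ...
+//   if (!gfxAlphaRecovery::RecoverAlphaSSE2(blackSurf, whiteSurf)) {
+//       // fall back to the scalar recovery path
+//   }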
