1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/gfx/thebes/gfxAlphaRecoverySSE2.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,236 @@ 1.4 +/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- 1.5 + * This Source Code Form is subject to the terms of the Mozilla Public 1.6 + * License, v. 2.0. If a copy of the MPL was not distributed with this 1.7 + * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ 1.8 + 1.9 +#include "gfxAlphaRecovery.h" 1.10 +#include "gfxImageSurface.h" 1.11 +#include "nsRect.h" 1.12 +#include <emmintrin.h> 1.13 + 1.14 +// This file should only be compiled on x86 and x64 systems. Additionally, 1.15 +// you'll need to compile it with -msse2 if you're using GCC on x86. 1.16 + 1.17 +#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64)) 1.18 +__declspec(align(16)) static uint32_t greenMaski[] = 1.19 + { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 }; 1.20 +__declspec(align(16)) static uint32_t alphaMaski[] = 1.21 + { 0xff000000, 0xff000000, 0xff000000, 0xff000000 }; 1.22 +#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) 1.23 +static uint32_t greenMaski[] __attribute__ ((aligned (16))) = 1.24 + { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 }; 1.25 +static uint32_t alphaMaski[] __attribute__ ((aligned (16))) = 1.26 + { 0xff000000, 0xff000000, 0xff000000, 0xff000000 }; 1.27 +#elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__)) 1.28 +#pragma align 16 (greenMaski, alphaMaski) 1.29 +static uint32_t greenMaski[] = { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 }; 1.30 +static uint32_t alphaMaski[] = { 0xff000000, 0xff000000, 0xff000000, 0xff000000 }; 1.31 +#endif 1.32 + 1.33 +bool 1.34 +gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf, 1.35 + const gfxImageSurface* whiteSurf) 1.36 +{ 1.37 + gfxIntSize size = blackSurf->GetSize(); 1.38 + 1.39 + if (size != whiteSurf->GetSize() || 1.40 + (blackSurf->Format() != gfxImageFormat::ARGB32 && 1.41 
+ blackSurf->Format() != gfxImageFormat::RGB24) || 1.42 + (whiteSurf->Format() != gfxImageFormat::ARGB32 && 1.43 + whiteSurf->Format() != gfxImageFormat::RGB24)) 1.44 + return false; 1.45 + 1.46 + blackSurf->Flush(); 1.47 + whiteSurf->Flush(); 1.48 + 1.49 + unsigned char* blackData = blackSurf->Data(); 1.50 + unsigned char* whiteData = whiteSurf->Data(); 1.51 + 1.52 + if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) || 1.53 + (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) { 1.54 + // Cannot keep these in alignment. 1.55 + return false; 1.56 + } 1.57 + 1.58 + __m128i greenMask = _mm_load_si128((__m128i*)greenMaski); 1.59 + __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski); 1.60 + 1.61 + for (int32_t i = 0; i < size.height; ++i) { 1.62 + int32_t j = 0; 1.63 + // Loop single pixels until at 16 byte alignment. 1.64 + while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) { 1.65 + *((uint32_t*)blackData) = 1.66 + RecoverPixel(*reinterpret_cast<uint32_t*>(blackData), 1.67 + *reinterpret_cast<uint32_t*>(whiteData)); 1.68 + blackData += 4; 1.69 + whiteData += 4; 1.70 + j++; 1.71 + } 1.72 + // This extra loop allows the compiler to do some more clever register 1.73 + // management and makes it about 5% faster than with only the 4 pixel 1.74 + // at a time loop. 1.75 + for (; j < size.width - 8; j += 8) { 1.76 + __m128i black1 = _mm_load_si128((__m128i*)blackData); 1.77 + __m128i white1 = _mm_load_si128((__m128i*)whiteData); 1.78 + __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16)); 1.79 + __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16)); 1.80 + 1.81 + // Execute the same instructions as described in RecoverPixel, only 1.82 + // using an SSE2 packed saturated subtract.
1.83 + white1 = _mm_subs_epu8(white1, black1); 1.84 + white2 = _mm_subs_epu8(white2, black2); 1.85 + white1 = _mm_subs_epu8(greenMask, white1); 1.86 + white2 = _mm_subs_epu8(greenMask, white2); 1.87 + // Producing the final black pixel in an XMM register and storing 1.88 + // that is actually faster than doing a masked store since that 1.89 + // does an unaligned storage. We have the black pixel in a register 1.90 + // anyway. 1.91 + black1 = _mm_andnot_si128(alphaMask, black1); 1.92 + black2 = _mm_andnot_si128(alphaMask, black2); 1.93 + white1 = _mm_slli_si128(white1, 2); 1.94 + white2 = _mm_slli_si128(white2, 2); 1.95 + white1 = _mm_and_si128(alphaMask, white1); 1.96 + white2 = _mm_and_si128(alphaMask, white2); 1.97 + black1 = _mm_or_si128(white1, black1); 1.98 + black2 = _mm_or_si128(white2, black2); 1.99 + 1.100 + _mm_store_si128((__m128i*)blackData, black1); 1.101 + _mm_store_si128((__m128i*)(blackData + 16), black2); 1.102 + blackData += 32; 1.103 + whiteData += 32; 1.104 + } 1.105 + for (; j < size.width - 4; j += 4) { 1.106 + __m128i black = _mm_load_si128((__m128i*)blackData); 1.107 + __m128i white = _mm_load_si128((__m128i*)whiteData); 1.108 + 1.109 + white = _mm_subs_epu8(white, black); 1.110 + white = _mm_subs_epu8(greenMask, white); 1.111 + black = _mm_andnot_si128(alphaMask, black); 1.112 + white = _mm_slli_si128(white, 2); 1.113 + white = _mm_and_si128(alphaMask, white); 1.114 + black = _mm_or_si128(white, black); 1.115 + _mm_store_si128((__m128i*)blackData, black); 1.116 + blackData += 16; 1.117 + whiteData += 16; 1.118 + } 1.119 + // Loop single pixels until we're done. 
1.120 + while (j < size.width) { 1.121 + *((uint32_t*)blackData) = 1.122 + RecoverPixel(*reinterpret_cast<uint32_t*>(blackData), 1.123 + *reinterpret_cast<uint32_t*>(whiteData)); 1.124 + blackData += 4; 1.125 + whiteData += 4; 1.126 + j++; 1.127 + } 1.128 + blackData += blackSurf->Stride() - j * 4; 1.129 + whiteData += whiteSurf->Stride() - j * 4; 1.130 + } 1.131 + 1.132 + blackSurf->MarkDirty(); 1.133 + 1.134 + return true; 1.135 +} 1.136 + 1.137 +static int32_t 1.138 +ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY=0, int32_t aStride=1) 1.139 +{ 1.140 + return (aX + aStride * aY) & ((1 << aAlignToLog2) - 1); 1.141 +} 1.142 + 1.143 +/*static*/ nsIntRect 1.144 +gfxAlphaRecovery::AlignRectForSubimageRecovery(const nsIntRect& aRect, 1.145 + gfxImageSurface* aSurface) 1.146 +{ 1.147 + NS_ASSERTION(gfxImageFormat::ARGB32 == aSurface->Format(), 1.148 + "Thebes grew support for non-ARGB32 COLOR_ALPHA?"); 1.149 + static const int32_t kByteAlignLog2 = GoodAlignmentLog2(); 1.150 + static const int32_t bpp = 4; 1.151 + static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp; 1.152 + // 1.153 + // We're going to create a subimage of the surface with size 1.154 + // <sw,sh> for alpha recovery, and want a SIMD fast-path. The 1.155 + // rect <x,y, w,h> /needs/ to be redrawn, but it might not be 1.156 + // properly aligned for SIMD. So we want to find a rect <x',y', 1.157 + // w',h'> that's a superset of what needs to be redrawn but is 1.158 + // properly aligned. Proper alignment is 1.159 + // 1.160 + // BPP * (x' + y' * sw) \cong 0 (mod ALIGN) 1.161 + // BPP * w' \cong BPP * sw (mod ALIGN) 1.162 + // 1.163 + // (We assume the pixel at surface <0,0> is already ALIGN'd.) 1.164 + // That rect (obviously) has to fit within the surface bounds, and 1.165 + // we should also minimize the extra pixels redrawn only for 1.166 + // alignment's sake. 
So we also want 1.167 + // 1.168 + // minimize <x',y', w',h'> 1.169 + // 0 <= x' <= x 1.170 + // 0 <= y' <= y 1.171 + // w <= w' <= sw 1.172 + // h <= h' <= sh 1.173 + // 1.174 + // This is a messy integer non-linear programming problem, except 1.175 + // ... we can assume that ALIGN/BPP is a very small constant. So, 1.176 + // brute force is viable. The algorithm below will find a 1.177 + // solution if one exists, but isn't guaranteed to find the 1.178 + // minimum solution. (For SSE2, ALIGN/BPP = 4, so it'll do at 1.179 + // most 64 iterations below). In what's likely the common case, 1.180 + // an already-aligned rectangle, it only needs 1 iteration. 1.181 + // 1.182 + // Is this alignment worth doing? Recovering alpha will take work 1.183 + // proportional to w*h (assuming alpha recovery computation isn't 1.184 + // memory bound). This analysis can lead to O(w+h) extra work 1.185 + // (with small constants). In exchange, we expect to shave off a 1.186 + // ALIGN/BPP constant by using SIMD-ized alpha recovery. So as 1.187 + // w*h diverges from w+h, the win factor approaches ALIGN/BPP. We 1.188 + // only really care about the w*h >> w+h case anyway; others 1.189 + // should be fast enough even with the overhead. (Unless the cost 1.190 + // of repainting the expanded rect is high, but in that case 1.191 + // SIMD-ized alpha recovery won't make a difference so this code 1.192 + // shouldn't be called.) 
1.193 + // 1.194 + gfxIntSize surfaceSize = aSurface->GetSize(); 1.195 + const int32_t stride = bpp * surfaceSize.width; 1.196 + if (stride != aSurface->Stride()) { 1.197 + NS_WARNING("Unexpected stride, falling back on slow alpha recovery"); 1.198 + return aRect; 1.199 + } 1.200 + 1.201 + const int32_t x = aRect.x, y = aRect.y, w = aRect.width, h = aRect.height; 1.202 + const int32_t r = x + w; 1.203 + const int32_t sw = surfaceSize.width; 1.204 + const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride); 1.205 + 1.206 + // The outer two loops below keep the rightmost (|r| above) and 1.207 + // bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we 1.208 + // return only a superset of the original rect. These loops 1.209 + // search for an aligned top-left pixel by trying to expand <x,y> 1.210 + // left and up by <dx,dy> pixels, respectively. 1.211 + // 1.212 + // Then if a properly-aligned top-left pixel is found, the 1.213 + // innermost loop tries to find an aligned stride by moving the 1.214 + // rightmost pixel rightward by dr. 1.215 + int32_t dx, dy, dr; 1.216 + for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) { 1.217 + for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) { 1.218 + if (0 != ByteAlignment(kByteAlignLog2, 1.219 + bpp * (x - dx), y - dy, stride)) { 1.220 + continue; 1.221 + } 1.222 + for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) { 1.223 + if (strideAlign == ByteAlignment(kByteAlignLog2, 1.224 + bpp * (w + dr + dx))) { 1.225 + goto FOUND_SOLUTION; 1.226 + } 1.227 + } 1.228 + } 1.229 + } 1.230 + 1.231 + // Didn't find a solution. 1.232 + return aRect; 1.233 + 1.234 +FOUND_SOLUTION: 1.235 + nsIntRect solution = nsIntRect(x - dx, y - dy, w + dr + dx, h + dy); 1.236 + NS_ABORT_IF_FALSE(nsIntRect(0, 0, sw, surfaceSize.height).Contains(solution), 1.237 + "'Solution' extends outside surface bounds!"); 1.238 + return solution; 1.239 +}