/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "gfxAlphaRecovery.h"
#include "gfxImageSurface.h"
#include "nsRect.h"
#include <emmintrin.h>

// This file should only be compiled on x86 and x64 systems. Additionally,
// you'll need to compile it with -msse2 if you're using GCC on x86.

#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
__declspec(align(16)) static uint32_t greenMaski[] =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
__declspec(align(16)) static uint32_t alphaMaski[] =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
static uint32_t greenMaski[] __attribute__ ((aligned (16))) =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] __attribute__ ((aligned (16))) =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__))
#pragma align 16 (greenMaski, alphaMaski)
static uint32_t greenMaski[] = { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] = { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#endif

bool
gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
                                   const gfxImageSurface* whiteSurf)
{
    gfxIntSize size = blackSurf->GetSize();

    if (size != whiteSurf->GetSize() ||
        (blackSurf->Format() != gfxImageFormat::ARGB32 &&
         blackSurf->Format() != gfxImageFormat::RGB24) ||
        (whiteSurf->Format() != gfxImageFormat::ARGB32 &&
         whiteSurf->Format() != gfxImageFormat::RGB24))
        return false;

    blackSurf->Flush();
    whiteSurf->Flush();

    unsigned char* blackData = blackSurf->Data();
    unsigned char* whiteData = whiteSurf->Data();

    if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
        (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
        // Cannot keep these in alignment.
        return false;
    }

    __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
    __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);

    for (int32_t i = 0; i < size.height; ++i) {
        int32_t j = 0;
        // Loop single pixels until at 16 byte alignment.
        while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
            *((uint32_t*)blackData) =
                RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
                             *reinterpret_cast<uint32_t*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        // This extra loop allows the compiler to do some more clever register
        // management and makes it about 5% faster than with only the
        // 4-pixels-at-a-time loop.
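        //
        // (Sketch of the per-pixel math, summarizing what the vector code
        // below computes rather than quoting RecoverPixel verbatim: each
        // recovered alpha byte is effectively 0xff minus the green-channel
        // difference between the white and black pixels, i.e.
        // greenMask - (white - black) with unsigned saturation; that byte is
        // then shifted two bytes left into the alpha position and merged with
        // the RGB bytes of the black pixel.)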
        for (; j < size.width - 8; j += 8) {
            __m128i black1 = _mm_load_si128((__m128i*)blackData);
            __m128i white1 = _mm_load_si128((__m128i*)whiteData);
            __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
            __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));

            // Execute the same instructions as described in RecoverPixel, only
            // using an SSE2 packed saturated subtract.
            white1 = _mm_subs_epu8(white1, black1);
            white2 = _mm_subs_epu8(white2, black2);
            white1 = _mm_subs_epu8(greenMask, white1);
            white2 = _mm_subs_epu8(greenMask, white2);
            // Producing the final black pixel in an XMM register and storing
            // that is actually faster than doing a masked store, since a
            // masked store does an unaligned store. We have the black pixel
            // in a register anyway.
            black1 = _mm_andnot_si128(alphaMask, black1);
            black2 = _mm_andnot_si128(alphaMask, black2);
            white1 = _mm_slli_si128(white1, 2);
            white2 = _mm_slli_si128(white2, 2);
            white1 = _mm_and_si128(alphaMask, white1);
            white2 = _mm_and_si128(alphaMask, white2);
            black1 = _mm_or_si128(white1, black1);
            black2 = _mm_or_si128(white2, black2);

            _mm_store_si128((__m128i*)blackData, black1);
            _mm_store_si128((__m128i*)(blackData + 16), black2);
            blackData += 32;
            whiteData += 32;
        }
        for (; j < size.width - 4; j += 4) {
            __m128i black = _mm_load_si128((__m128i*)blackData);
            __m128i white = _mm_load_si128((__m128i*)whiteData);

            white = _mm_subs_epu8(white, black);
            white = _mm_subs_epu8(greenMask, white);
            black = _mm_andnot_si128(alphaMask, black);
            white = _mm_slli_si128(white, 2);
            white = _mm_and_si128(alphaMask, white);
            black = _mm_or_si128(white, black);
            _mm_store_si128((__m128i*)blackData, black);
            blackData += 16;
            whiteData += 16;
        }
        // Loop single pixels until we're done.
        while (j < size.width) {
            *((uint32_t*)blackData) =
                RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
                             *reinterpret_cast<uint32_t*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        blackData += blackSurf->Stride() - j * 4;
        whiteData += whiteSurf->Stride() - j * 4;
    }

    blackSurf->MarkDirty();

    return true;
}

static int32_t
ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY=0, int32_t aStride=1)
{
    return (aX + aStride * aY) & ((1 << aAlignToLog2) - 1);
}

/*static*/ nsIntRect
gfxAlphaRecovery::AlignRectForSubimageRecovery(const nsIntRect& aRect,
                                               gfxImageSurface* aSurface)
{
    NS_ASSERTION(gfxImageFormat::ARGB32 == aSurface->Format(),
                 "Thebes grew support for non-ARGB32 COLOR_ALPHA?");
    static const int32_t kByteAlignLog2 = GoodAlignmentLog2();
    static const int32_t bpp = 4;
    static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp;
    //
    // We're going to create a subimage of the surface with size
    // <sw,sh> for alpha recovery, and want a SIMD fast-path.
    // The rect <x,y,w,h> /needs/ to be redrawn, but it might not be
    // properly aligned for SIMD. So we want to find a rect
    // <x',y',w',h'> that's a superset of what needs to be redrawn but
    // is properly aligned. Proper alignment is
    //
    //   BPP * (x' + y' * sw) \cong 0         (mod ALIGN)
    //   BPP * w'             \cong BPP * sw  (mod ALIGN)
    //
    // (We assume the pixel at surface <0,0> is already ALIGN'd.)
    // That rect (obviously) has to fit within the surface bounds, and
    // we should also minimize the extra pixels redrawn only for
    // alignment's sake. So we also want
    //
    //   minimize <x-x', y-y', w'-w, h'-h>
    //   0 <= x' <= x
    //   0 <= y' <= y
    //   w <= w' <= sw
    //   h <= h' <= sh
    //
    // This is a messy integer non-linear programming problem, except
    // ... we can assume that ALIGN/BPP is a very small constant. So,
    // brute force is viable. The algorithm below will find a
    // solution if one exists, but isn't guaranteed to find the
    // minimum solution. (For SSE2, ALIGN/BPP = 4, so it'll do at
    // most 64 iterations below.) In what's likely the common case,
    // an already-aligned rectangle, it only needs 1 iteration.
    //
    // Is this alignment worth doing? Recovering alpha will take work
    // proportional to w*h (assuming alpha recovery computation isn't
    // memory bound). This analysis can lead to O(w+h) extra work
    // (with small constants). In exchange, we expect to shave off an
    // ALIGN/BPP constant factor by using SIMD-ized alpha recovery.
    // So as w*h diverges from w+h, the win factor approaches ALIGN/BPP.
    // We only really care about the w*h >> w+h case anyway; others
    // should be fast enough even with the overhead. (Unless the cost
    // of repainting the expanded rect is high, but in that case
    // SIMD-ized alpha recovery won't make a difference, so this code
    // shouldn't be called.)
    //
    gfxIntSize surfaceSize = aSurface->GetSize();
    const int32_t stride = bpp * surfaceSize.width;
    if (stride != aSurface->Stride()) {
        NS_WARNING("Unexpected stride, falling back on slow alpha recovery");
        return aRect;
    }

    const int32_t x = aRect.x, y = aRect.y, w = aRect.width, h = aRect.height;
    const int32_t r = x + w;
    const int32_t sw = surfaceSize.width;
    const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride);

    // The outer two loops below keep the rightmost (|r| above) and
    // bottommost pixels in |aRect| fixed wrt <dx,dy>, to ensure that we
    // return only a superset of the original rect. These loops
    // search for an aligned top-left pixel by trying to expand
    // <x,y> left and up by <dx,dy> pixels, respectively.
    //
    // Then if a properly-aligned top-left pixel is found, the
    // innermost loop tries to find an aligned stride by moving the
    // rightmost pixel rightward by dr.
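    //
    // (Illustrative example with assumed numbers: for 16-byte alignment and
    // 4 bytes per pixel, pixPerAlign is 4. If the stride is itself a multiple
    // of 16 and the rect's left edge sits at x = 5, then expanding left by
    // dx = 1 gives bpp * (x - dx) = 16, which is 16-byte aligned, so each row
    // of the subimage can start on an aligned load.)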
    int32_t dx, dy, dr;
    for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) {
        for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) {
            if (0 != ByteAlignment(kByteAlignLog2,
                                   bpp * (x - dx), y - dy, stride)) {
                continue;
            }
            for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) {
                if (strideAlign == ByteAlignment(kByteAlignLog2,
                                                 bpp * (w + dr + dx))) {
                    goto FOUND_SOLUTION;
                }
            }
        }
    }

    // Didn't find a solution.
    return aRect;

FOUND_SOLUTION:
    nsIntRect solution = nsIntRect(x - dx, y - dy, w + dr + dx, h + dy);
    NS_ABORT_IF_FALSE(nsIntRect(0, 0, sw, surfaceSize.height).Contains(solution),
                      "'Solution' extends outside surface bounds!");
    return solution;
}
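
// A rough usage sketch (hypothetical caller; |dirtyRect|, the subimage
// surfaces, and the scalar RecoverAlpha() fallback are assumptions, not code
// from this file):
//
//   nsIntRect aligned =
//       gfxAlphaRecovery::AlignRectForSubimageRecovery(dirtyRect, blackSurf);
//   // ... redraw |aligned| into the black and white surfaces ...
//   if (!gfxAlphaRecovery::RecoverAlphaSSE2(blackSubimage, whiteSubimage)) {
//       gfxAlphaRecovery::RecoverAlpha(blackSubimage, whiteSubimage);
//   }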