Wed, 31 Dec 2014 06:09:35 +0100
Cloned from upstream origin tor-browser at tag tor-browser-31.3.0esr-4.5-1-build1
(revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f) for hacking purposes.
michael@0 | 1 | /* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*- |
michael@0 | 2 | * This Source Code Form is subject to the terms of the Mozilla Public |
michael@0 | 3 | * License, v. 2.0. If a copy of the MPL was not distributed with this |
michael@0 | 4 | * file, You can obtain one at http://mozilla.org/MPL/2.0/. */ |
michael@0 | 5 | |
michael@0 | 6 | #include "gfxAlphaRecovery.h" |
michael@0 | 7 | #include "gfxImageSurface.h" |
michael@0 | 8 | #include "nsRect.h" |
michael@0 | 9 | #include <emmintrin.h> |
michael@0 | 10 | |
// This file should only be compiled on x86 and x64 systems. Additionally,
// you'll need to compile it with -msse2 if you're using GCC on x86.

// 16-byte-aligned constant masks used by the SSE2 loops below.  Each array
// spans one XMM register (4 x 32-bit pixels): greenMaski selects the green
// byte of every pixel, alphaMaski selects the alpha byte.  The alignment
// attribute is spelled differently per compiler (MSVC, GCC/Clang, Sun
// Studio), hence the three branches; alignment is required because the
// masks are loaded with the aligned _mm_load_si128.
#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
__declspec(align(16)) static uint32_t greenMaski[] =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
__declspec(align(16)) static uint32_t alphaMaski[] =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
static uint32_t greenMaski[] __attribute__ ((aligned (16))) =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] __attribute__ ((aligned (16))) =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__))
#pragma align 16 (greenMaski, alphaMaski)
static uint32_t greenMaski[] = { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] = { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#endif
michael@0 | 29 | |
// Recovers the alpha channel of |blackSurf| in place, given the same scene
// rendered over black (blackSurf) and over white (whiteSurf).  Per pixel the
// scalar path delegates to RecoverPixel(); the SSE2 path performs the same
// computation four pixels at a time.  Returns false, leaving blackSurf
// unmodified, when the surface pair cannot be handled here: mismatched
// sizes, unsupported formats, or buffers whose relative alignment makes
// 16-byte aligned loads impossible.
bool
gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
                                   const gfxImageSurface* whiteSurf)
{
    gfxIntSize size = blackSurf->GetSize();

    // Both surfaces must match in size and be ARGB32 or RGB24.  The code
    // below advances 4 bytes per pixel for both formats (presumably RGB24 is
    // stored 32 bits per pixel here -- NOTE(review): confirm against the
    // gfxImageSurface format documentation).
    if (size != whiteSurf->GetSize() ||
        (blackSurf->Format() != gfxImageFormat::ARGB32 &&
         blackSurf->Format() != gfxImageFormat::RGB24) ||
        (whiteSurf->Format() != gfxImageFormat::ARGB32 &&
         whiteSurf->Format() != gfxImageFormat::RGB24))
        return false;

    blackSurf->Flush();
    whiteSurf->Flush();

    unsigned char* blackData = blackSurf->Data();
    unsigned char* whiteData = whiteSurf->Data();

    // The two buffers must share the same offset within a 16-byte line, and
    // their strides must differ only by a multiple of 16; otherwise the
    // aligned loads below cannot stay in lockstep across rows.
    if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
        (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
        // Cannot keep these in alignment.
        return false;
    }

    __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
    __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);

    for (int32_t i = 0; i < size.height; ++i) {
        int32_t j = 0;
        // Loop single pixels until at 16-byte alignment (the 0xf test), so
        // the SIMD loops below may use aligned loads/stores.
        while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
            *((uint32_t*)blackData) =
                RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
                             *reinterpret_cast<uint32_t*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        // This extra loop allows the compiler to do some more clever register
        // management and makes it about 5% faster than with only the 4 pixel
        // at a time loop.  (The strict '<' bounds here and below are
        // conservative: up to 8 trailing pixels fall through to the scalar
        // loop at the end.)
        for (; j < size.width - 8; j += 8) {
            __m128i black1 = _mm_load_si128((__m128i*)blackData);
            __m128i white1 = _mm_load_si128((__m128i*)whiteData);
            __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
            __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));

            // Execute the same instructions as described in RecoverPixel, only
            // using an SSE2 packed saturated subtract.
            white1 = _mm_subs_epu8(white1, black1);
            white2 = _mm_subs_epu8(white2, black2);
            white1 = _mm_subs_epu8(greenMask, white1);
            white2 = _mm_subs_epu8(greenMask, white2);
            // Producing the final black pixel in an XMM register and storing
            // that is actually faster than doing a masked store since that
            // does an unaligned storage. We have the black pixel in a register
            // anyway.  The 2-byte register shift moves each pixel's recovered
            // green byte into its own alpha byte position before masking.
            black1 = _mm_andnot_si128(alphaMask, black1);
            black2 = _mm_andnot_si128(alphaMask, black2);
            white1 = _mm_slli_si128(white1, 2);
            white2 = _mm_slli_si128(white2, 2);
            white1 = _mm_and_si128(alphaMask, white1);
            white2 = _mm_and_si128(alphaMask, white2);
            black1 = _mm_or_si128(white1, black1);
            black2 = _mm_or_si128(white2, black2);

            _mm_store_si128((__m128i*)blackData, black1);
            _mm_store_si128((__m128i*)(blackData + 16), black2);
            blackData += 32;
            whiteData += 32;
        }
        // Same recovery, one XMM register (4 pixels) at a time.
        for (; j < size.width - 4; j += 4) {
            __m128i black = _mm_load_si128((__m128i*)blackData);
            __m128i white = _mm_load_si128((__m128i*)whiteData);

            white = _mm_subs_epu8(white, black);
            white = _mm_subs_epu8(greenMask, white);
            black = _mm_andnot_si128(alphaMask, black);
            white = _mm_slli_si128(white, 2);
            white = _mm_and_si128(alphaMask, white);
            black = _mm_or_si128(white, black);
            _mm_store_si128((__m128i*)blackData, black);
            blackData += 16;
            whiteData += 16;
        }
        // Loop single pixels until we're done.
        while (j < size.width) {
            *((uint32_t*)blackData) =
                RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
                             *reinterpret_cast<uint32_t*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        // Skip any row padding: advance by the stride minus the bytes
        // already consumed for this row's pixels.
        blackData += blackSurf->Stride() - j * 4;
        whiteData += whiteSurf->Stride() - j * 4;
    }

    blackSurf->MarkDirty();

    return true;
}
michael@0 | 133 | |
// Returns the byte offset of the element at <aX, aY> in a buffer of the
// given stride, reduced modulo the 2^aAlignToLog2 alignment boundary.
// A zero result means the element sits on an aligned address.
static int32_t
ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY=0, int32_t aStride=1)
{
    const int32_t byteOffset = aX + aStride * aY;
    const int32_t alignMask = (1 << aAlignToLog2) - 1;
    return byteOffset & alignMask;
}
michael@0 | 139 | |
// Expands |aRect| (if possible) to a superset rect whose top-left pixel and
// effective stride are aligned for the SIMD fast path, searching by brute
// force over at most pixPerAlign offsets in each dimension.  Falls back to
// returning |aRect| unchanged when the surface stride is unexpected or no
// aligned solution fits within the surface bounds.
/*static*/ nsIntRect
gfxAlphaRecovery::AlignRectForSubimageRecovery(const nsIntRect& aRect,
                                               gfxImageSurface* aSurface)
{
    NS_ASSERTION(gfxImageFormat::ARGB32 == aSurface->Format(),
                 "Thebes grew support for non-ARGB32 COLOR_ALPHA?");
    static const int32_t kByteAlignLog2 = GoodAlignmentLog2();
    static const int32_t bpp = 4;
    static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp;
    //
    // We're going to create a subimage of the surface with size
    // <sw,sh> for alpha recovery, and want a SIMD fast-path.  The
    // rect <x,y, w,h> /needs/ to be redrawn, but it might not be
    // properly aligned for SIMD.  So we want to find a rect <x',y',
    // w',h'> that's a superset of what needs to be redrawn but is
    // properly aligned.  Proper alignment is
    //
    //   BPP * (x' + y' * sw) \cong 0         (mod ALIGN)
    //   BPP * w'             \cong BPP * sw  (mod ALIGN)
    //
    // (We assume the pixel at surface <0,0> is already ALIGN'd.)
    // That rect (obviously) has to fit within the surface bounds, and
    // we should also minimize the extra pixels redrawn only for
    // alignment's sake.  So we also want
    //
    //  minimize <x',y', w',h'>
    //   0 <= x' <= x
    //   0 <= y' <= y
    //   w <= w' <= sw
    //   h <= h' <= sh
    //
    // This is a messy integer non-linear programming problem, except
    // ... we can assume that ALIGN/BPP is a very small constant.  So,
    // brute force is viable.  The algorithm below will find a
    // solution if one exists, but isn't guaranteed to find the
    // minimum solution.  (For SSE2, ALIGN/BPP = 4, so it'll do at
    // most 64 iterations below).  In what's likely the common case,
    // an already-aligned rectangle, it only needs 1 iteration.
    //
    // Is this alignment worth doing?  Recovering alpha will take work
    // proportional to w*h (assuming alpha recovery computation isn't
    // memory bound).  This analysis can lead to O(w+h) extra work
    // (with small constants).  In exchange, we expect to shave off a
    // ALIGN/BPP constant by using SIMD-ized alpha recovery.  So as
    // w*h diverges from w+h, the win factor approaches ALIGN/BPP.  We
    // only really care about the w*h >> w+h case anyway; others
    // should be fast enough even with the overhead.  (Unless the cost
    // of repainting the expanded rect is high, but in that case
    // SIMD-ized alpha recovery won't make a difference so this code
    // shouldn't be called.)
    //
    gfxIntSize surfaceSize = aSurface->GetSize();
    const int32_t stride = bpp * surfaceSize.width;
    // The analysis above assumes a packed surface (stride == bpp * width);
    // bail out to the unaligned slow path otherwise.
    if (stride != aSurface->Stride()) {
        NS_WARNING("Unexpected stride, falling back on slow alpha recovery");
        return aRect;
    }

    const int32_t x = aRect.x, y = aRect.y, w = aRect.width, h = aRect.height;
    const int32_t r = x + w;
    const int32_t sw = surfaceSize.width;
    const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride);

    // The outer two loops below keep the rightmost (|r| above) and
    // bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we
    // return only a superset of the original rect.  These loops
    // search for an aligned top-left pixel by trying to expand <x,y>
    // left and up by <dx,dy> pixels, respectively.
    //
    // Then if a properly-aligned top-left pixel is found, the
    // innermost loop tries to find an aligned stride by moving the
    // rightmost pixel rightward by dr.
    int32_t dx, dy, dr;
    for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) {
        for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) {
            if (0 != ByteAlignment(kByteAlignLog2,
                                   bpp * (x - dx), y - dy, stride)) {
                continue;
            }
            for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) {
                if (strideAlign == ByteAlignment(kByteAlignLog2,
                                                 bpp * (w + dr + dx))) {
                    goto FOUND_SOLUTION;
                }
            }
        }
    }

    // Didn't find a solution.
    return aRect;

FOUND_SOLUTION:
    nsIntRect solution = nsIntRect(x - dx, y - dy, w + dr + dx, h + dy);
    NS_ABORT_IF_FALSE(nsIntRect(0, 0, sw, surfaceSize.height).Contains(solution),
                      "'Solution' extends outside surface bounds!");
    return solution;
}