gfx/thebes/gfxAlphaRecoverySSE2.cpp

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Tue, 06 Jan 2015 21:39:09 +0100
branch       TOR_BUG_9701
changeset    8:97036ab72558
permissions  -rw-r--r--

Conditionally force memory storage according to privacy.thirdparty.isolate;
this solves Tor bug #9701 by complying with the disk-avoidance requirement
documented at https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
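As a rough illustration of that pattern (not part of this file, and using a
hypothetical helper name), such a change typically consults the preference
through Gecko's Preferences API before choosing a storage backend; whether the
pref is read as a boolean or as an integer level depends on the actual patch:

    #include "mozilla/Preferences.h"

    // Hypothetical sketch: force memory-only storage whenever the isolation
    // pref is enabled, in line with the disk-avoidance requirement above.
    static bool ShouldForceMemoryStorage()
    {
        return mozilla::Preferences::GetBool("privacy.thirdparty.isolate", false);
    }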

/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "gfxAlphaRecovery.h"
#include "gfxImageSurface.h"
#include "nsRect.h"
#include <emmintrin.h>

// This file should only be compiled on x86 and x64 systems. Additionally,
// you'll need to compile it with -msse2 if you're using GCC on x86.

#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
__declspec(align(16)) static uint32_t greenMaski[] =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
__declspec(align(16)) static uint32_t alphaMaski[] =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
static uint32_t greenMaski[] __attribute__ ((aligned (16))) =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] __attribute__ ((aligned (16))) =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__))
#pragma align 16 (greenMaski, alphaMaski)
static uint32_t greenMaski[] = { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] = { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#endif
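// The 16-byte alignment on the mask tables above is required because they are
// loaded with _mm_load_si128 below, which (unlike _mm_loadu_si128) requires a
// 16-byte-aligned address.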

bool
gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
                                   const gfxImageSurface* whiteSurf)
{
    gfxIntSize size = blackSurf->GetSize();

    if (size != whiteSurf->GetSize() ||
        (blackSurf->Format() != gfxImageFormat::ARGB32 &&
         blackSurf->Format() != gfxImageFormat::RGB24) ||
        (whiteSurf->Format() != gfxImageFormat::ARGB32 &&
         whiteSurf->Format() != gfxImageFormat::RGB24))
        return false;

    blackSurf->Flush();
    whiteSurf->Flush();

    unsigned char* blackData = blackSurf->Data();
    unsigned char* whiteData = whiteSurf->Data();

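    // The scalar head loop below advances blackData and whiteData in lockstep,
    // so they can only reach a 16-byte boundary together if they start at the
    // same offset within a 16-byte block; likewise the strides must differ by
    // a multiple of 16 so that property holds on every row.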
    if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
        (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
        // Cannot keep these in alignment.
        return false;
    }

    __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
    __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);

    for (int32_t i = 0; i < size.height; ++i) {
        int32_t j = 0;
        // Loop over single pixels until the data pointers reach 16-byte alignment.
        while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
            *((uint32_t*)blackData) =
                RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
                             *reinterpret_cast<uint32_t*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        // This extra loop allows the compiler to do some more clever register
        // management and makes it about 5% faster than with only the
        // 4-pixels-at-a-time loop.
        for (; j < size.width - 8; j += 8) {
            __m128i black1 = _mm_load_si128((__m128i*)blackData);
            __m128i white1 = _mm_load_si128((__m128i*)whiteData);
            __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
            __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));

            // Execute the same instructions as described in RecoverPixel, only
            // using an SSE2 packed saturated subtract.
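            // The recovery relies on the green channel: a pixel with color c
            // and alpha a composites to a*c/255 on black and to
            // a*c/255 + (255 - a) on white, so alpha = 255 - (whiteG - blackG).
            // The subtraction from greenMask computes exactly that, and the
            // 2-byte left shift further down moves the recovered value from
            // the green byte into the alpha byte of each pixel.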
            white1 = _mm_subs_epu8(white1, black1);
            white2 = _mm_subs_epu8(white2, black2);
            white1 = _mm_subs_epu8(greenMask, white1);
            white2 = _mm_subs_epu8(greenMask, white2);
            // Producing the final black pixel in an XMM register and storing
            // that is actually faster than doing a masked store, since that
            // would be an unaligned store. We have the black pixel in a
            // register anyway.
            black1 = _mm_andnot_si128(alphaMask, black1);
            black2 = _mm_andnot_si128(alphaMask, black2);
            white1 = _mm_slli_si128(white1, 2);
            white2 = _mm_slli_si128(white2, 2);
            white1 = _mm_and_si128(alphaMask, white1);
            white2 = _mm_and_si128(alphaMask, white2);
            black1 = _mm_or_si128(white1, black1);
            black2 = _mm_or_si128(white2, black2);

            _mm_store_si128((__m128i*)blackData, black1);
            _mm_store_si128((__m128i*)(blackData + 16), black2);
            blackData += 32;
            whiteData += 32;
        }
        for (; j < size.width - 4; j += 4) {
            __m128i black = _mm_load_si128((__m128i*)blackData);
            __m128i white = _mm_load_si128((__m128i*)whiteData);

            white = _mm_subs_epu8(white, black);
            white = _mm_subs_epu8(greenMask, white);
            black = _mm_andnot_si128(alphaMask, black);
            white = _mm_slli_si128(white, 2);
            white = _mm_and_si128(alphaMask, white);
            black = _mm_or_si128(white, black);
            _mm_store_si128((__m128i*)blackData, black);
            blackData += 16;
            whiteData += 16;
        }
        // Loop single pixels until we're done.
        while (j < size.width) {
            *((uint32_t*)blackData) =
                RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
                             *reinterpret_cast<uint32_t*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
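        // Skip the row padding: j * 4 bytes of each row have already been
        // consumed by the loops above.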
        blackData += blackSurf->Stride() - j * 4;
        whiteData += whiteSurf->Stride() - j * 4;
    }

    blackSurf->MarkDirty();

    return true;
}

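// Returns the misalignment, modulo 2^aAlignToLog2 bytes, of the byte offset
// aX + aStride * aY; a result of zero means the offset lies on an aligned
// boundary.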
static int32_t
ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY=0, int32_t aStride=1)
{
    return (aX + aStride * aY) & ((1 << aAlignToLog2) - 1);
}

/*static*/ nsIntRect
gfxAlphaRecovery::AlignRectForSubimageRecovery(const nsIntRect& aRect,
                                               gfxImageSurface* aSurface)
{
    NS_ASSERTION(gfxImageFormat::ARGB32 == aSurface->Format(),
                 "Thebes grew support for non-ARGB32 COLOR_ALPHA?");
    static const int32_t kByteAlignLog2 = GoodAlignmentLog2();
    static const int32_t bpp = 4;
    static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp;
    //
    // We're going to create a subimage of the surface with size
    // <sw,sh> for alpha recovery, and want a SIMD fast-path. The
    // rect <x,y, w,h> /needs/ to be redrawn, but it might not be
    // properly aligned for SIMD. So we want to find a rect <x',y',
    // w',h'> that's a superset of what needs to be redrawn but is
    // properly aligned. Proper alignment is
    //
    //   BPP * (x' + y' * sw) \cong 0         (mod ALIGN)
    //   BPP * w'             \cong BPP * sw  (mod ALIGN)
    //
    // (We assume the pixel at surface <0,0> is already ALIGN'd.)
    // That rect (obviously) has to fit within the surface bounds, and
    // we should also minimize the extra pixels redrawn only for
    // alignment's sake. So we also want
    //
    //   minimize <x',y', w',h'>
    //   0 <= x' <= x
    //   0 <= y' <= y
    //   w <= w' <= sw
    //   h <= h' <= sh
    //
    // This is a messy integer non-linear programming problem, except
    // ... we can assume that ALIGN/BPP is a very small constant. So,
    // brute force is viable. The algorithm below will find a
    // solution if one exists, but isn't guaranteed to find the
    // minimum solution. (For SSE2, ALIGN/BPP = 4, so it'll do at
    // most 64 iterations below). In what's likely the common case,
    // an already-aligned rectangle, it only needs 1 iteration.
    //
    // Is this alignment worth doing? Recovering alpha will take work
    // proportional to w*h (assuming alpha recovery computation isn't
    // memory bound). This analysis can lead to O(w+h) extra work
    // (with small constants). In exchange, we expect to shave off a
    // ALIGN/BPP constant by using SIMD-ized alpha recovery. So as
    // w*h diverges from w+h, the win factor approaches ALIGN/BPP. We
    // only really care about the w*h >> w+h case anyway; others
    // should be fast enough even with the overhead. (Unless the cost
    // of repainting the expanded rect is high, but in that case
    // SIMD-ized alpha recovery won't make a difference so this code
    // shouldn't be called.)
    //
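    // Worked example (illustrative numbers, not from any real caller): with
    // SSE2, ALIGN = 16 and BPP = 4, so pixPerAlign = 4. For sw = 100
    // (stride 400, already a multiple of 16) and a dirty rect
    // <x,y,w> = <5,3,10>: 5 + 3*100 = 305 == 1 (mod 4), so the search moves
    // left by dx = 1 to x' = 4; then 4*(10 + dr + 1) must be 0 (mod 16),
    // which dr = 1 satisfies, giving the expanded rect <4, 3, 12, h>.
    //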
    gfxIntSize surfaceSize = aSurface->GetSize();
    const int32_t stride = bpp * surfaceSize.width;
    if (stride != aSurface->Stride()) {
        NS_WARNING("Unexpected stride, falling back on slow alpha recovery");
        return aRect;
    }

    const int32_t x = aRect.x, y = aRect.y, w = aRect.width, h = aRect.height;
    const int32_t r = x + w;
    const int32_t sw = surfaceSize.width;
    const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride);

    // The outer two loops below keep the rightmost (|r| above) and
    // bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we
    // return only a superset of the original rect. These loops
    // search for an aligned top-left pixel by trying to expand <x,y>
    // left and up by <dx,dy> pixels, respectively.
    //
    // Then if a properly-aligned top-left pixel is found, the
    // innermost loop tries to find an aligned stride by moving the
    // rightmost pixel rightward by dr.
    int32_t dx, dy, dr;
    for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) {
        for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) {
            if (0 != ByteAlignment(kByteAlignLog2,
                                   bpp * (x - dx), y - dy, stride)) {
                continue;
            }
            for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) {
                if (strideAlign == ByteAlignment(kByteAlignLog2,
                                                 bpp * (w + dr + dx))) {
                    goto FOUND_SOLUTION;
                }
            }
        }
    }

    // Didn't find a solution.
    return aRect;

FOUND_SOLUTION:
    nsIntRect solution = nsIntRect(x - dx, y - dy, w + dr + dx, h + dy);
    NS_ABORT_IF_FALSE(nsIntRect(0, 0, sw, surfaceSize.height).Contains(solution),
                      "'Solution' extends outside surface bounds!");
    return solution;
}
