gfx/thebes/gfxAlphaRecoverySSE2.cpp

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Tue, 06 Jan 2015 21:39:09 +0100
branch       TOR_BUG_9701
changeset    8:97036ab72558
permissions  -rw-r--r--

Conditionally force memory storage according to privacy.thirdparty.isolate.
This resolves Tor bug #9701, complying with the disk-avoidance requirement
documented at https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "gfxAlphaRecovery.h"
#include "gfxImageSurface.h"
#include "nsRect.h"
#include <emmintrin.h>

// This file should only be compiled on x86 and x64 systems.  Additionally,
// you'll need to compile it with -msse2 if you're using GCC on x86.
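// (Illustrative only, not the actual Mozilla build rule: a hypothetical
//  standalone compile with 32-bit GCC would pass something like
//      g++ -m32 -msse2 -O2 -c gfxAlphaRecoverySSE2.cpp
//  whereas on x86-64 SSE2 is part of the baseline ISA and needs no extra flag.)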

#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
__declspec(align(16)) static uint32_t greenMaski[] =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
__declspec(align(16)) static uint32_t alphaMaski[] =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
static uint32_t greenMaski[] __attribute__ ((aligned (16))) =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] __attribute__ ((aligned (16))) =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__))
#pragma align 16 (greenMaski, alphaMaski)
static uint32_t greenMaski[] = { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] = { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#endif

bool
gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
                                   const gfxImageSurface* whiteSurf)
{
    gfxIntSize size = blackSurf->GetSize();

    if (size != whiteSurf->GetSize() ||
        (blackSurf->Format() != gfxImageFormat::ARGB32 &&
         blackSurf->Format() != gfxImageFormat::RGB24) ||
        (whiteSurf->Format() != gfxImageFormat::ARGB32 &&
         whiteSurf->Format() != gfxImageFormat::RGB24))
        return false;

    blackSurf->Flush();
    whiteSurf->Flush();

    unsigned char* blackData = blackSurf->Data();
    unsigned char* whiteData = whiteSurf->Data();

    if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
        (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
        // Cannot keep these in alignment.
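        // (That is: the per-row loop below advances blackData to a 16-byte
        //  boundary, and whiteData only stays 16-byte aligned alongside it if
        //  both pointers share the same offset modulo 16 and the two strides
        //  differ by a multiple of 16.)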
        return false;
    }

    __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
    __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);

    for (int32_t i = 0; i < size.height; ++i) {
        int32_t j = 0;
        // Loop over single pixels until blackData reaches 16-byte alignment.
        while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
            *((uint32_t*)blackData) =
                RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
                             *reinterpret_cast<uint32_t*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        // This extra loop allows the compiler to do some more clever register
        // management and makes it about 5% faster than with only the
        // four-pixels-at-a-time loop.
        for (; j < size.width - 8; j += 8) {
            __m128i black1 = _mm_load_si128((__m128i*)blackData);
            __m128i white1 = _mm_load_si128((__m128i*)whiteData);
            __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
            __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));

            // Execute the same instructions as described in RecoverPixel, only
            // using an SSE2 packed saturated subtract.
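            // (For reference, a scalar sketch of the same math, assuming
            //  RecoverPixel applies the usual black/white recovery formula:
            //      alpha  = 0xff - (greenWhite - greenBlack);
            //      result = (alpha << 24) | (black & 0x00ffffff);
            //  The two saturated subtracts below leave 0xff - (Gw - Gb) in the
            //  green byte, and the later 2-byte shift moves that value into the
            //  alpha byte of the same pixel.)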
            white1 = _mm_subs_epu8(white1, black1);
            white2 = _mm_subs_epu8(white2, black2);
            white1 = _mm_subs_epu8(greenMask, white1);
            white2 = _mm_subs_epu8(greenMask, white2);
            // Producing the final black pixel in an XMM register and storing
            // that is actually faster than doing a masked store since that
            // does an unaligned store. We have the black pixel in a register
            // anyway.
            black1 = _mm_andnot_si128(alphaMask, black1);
            black2 = _mm_andnot_si128(alphaMask, black2);
            white1 = _mm_slli_si128(white1, 2);
            white2 = _mm_slli_si128(white2, 2);
            white1 = _mm_and_si128(alphaMask, white1);
            white2 = _mm_and_si128(alphaMask, white2);
            black1 = _mm_or_si128(white1, black1);
            black2 = _mm_or_si128(white2, black2);

            _mm_store_si128((__m128i*)blackData, black1);
            _mm_store_si128((__m128i*)(blackData + 16), black2);
            blackData += 32;
            whiteData += 32;
        }
        for (; j < size.width - 4; j += 4) {
            __m128i black = _mm_load_si128((__m128i*)blackData);
            __m128i white = _mm_load_si128((__m128i*)whiteData);

            white = _mm_subs_epu8(white, black);
            white = _mm_subs_epu8(greenMask, white);
            black = _mm_andnot_si128(alphaMask, black);
            white = _mm_slli_si128(white, 2);
            white = _mm_and_si128(alphaMask, white);
            black = _mm_or_si128(white, black);
            _mm_store_si128((__m128i*)blackData, black);
            blackData += 16;
            whiteData += 16;
        }
        // Loop single pixels until we're done.
        while (j < size.width) {
            *((uint32_t*)blackData) =
                RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
                             *reinterpret_cast<uint32_t*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        blackData += blackSurf->Stride() - j * 4;
        whiteData += whiteSurf->Stride() - j * 4;
    }

    blackSurf->MarkDirty();

    return true;
}

static int32_t
ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY=0, int32_t aStride=1)
{
    return (aX + aStride * aY) & ((1 << aAlignToLog2) - 1);
}

/*static*/ nsIntRect
gfxAlphaRecovery::AlignRectForSubimageRecovery(const nsIntRect& aRect,
                                               gfxImageSurface* aSurface)
{
    NS_ASSERTION(gfxImageFormat::ARGB32 == aSurface->Format(),
                 "Thebes grew support for non-ARGB32 COLOR_ALPHA?");
    static const int32_t kByteAlignLog2 = GoodAlignmentLog2();
    static const int32_t bpp = 4;
    static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp;
    //
    // We're going to create a subimage of the surface with size
    // <sw,sh> for alpha recovery, and want a SIMD fast-path.  The
    // rect <x,y, w,h> /needs/ to be redrawn, but it might not be
    // properly aligned for SIMD.  So we want to find a rect <x',y',
    // w',h'> that's a superset of what needs to be redrawn but is
    // properly aligned.  Proper alignment is
    //
    //   BPP * (x' + y' * sw) \cong 0         (mod ALIGN)
    //   BPP * w'             \cong BPP * sw  (mod ALIGN)
    //
    // (We assume the pixel at surface <0,0> is already ALIGN'd.)
    // That rect (obviously) has to fit within the surface bounds, and
    // we should also minimize the extra pixels redrawn only for
    // alignment's sake.  So we also want
    //
    //  minimize <x',y', w',h'>
    //   0 <= x' <= x
    //   0 <= y' <= y
    //   w <= w' <= sw
    //   h <= h' <= sh
    //
    // This is a messy integer non-linear programming problem, except
    // ... we can assume that ALIGN/BPP is a very small constant.  So,
    // brute force is viable.  The algorithm below will find a
    // solution if one exists, but isn't guaranteed to find the
    // minimum solution.  (For SSE2, ALIGN/BPP = 4, so it'll do at
    // most 64 iterations below).  In what's likely the common case,
    // an already-aligned rectangle, it only needs 1 iteration.
    //
    // Is this alignment worth doing?  Recovering alpha will take work
    // proportional to w*h (assuming alpha recovery computation isn't
    // memory bound).  This analysis can lead to O(w+h) extra work
    // (with small constants).  In exchange, we expect to shave off an
    // ALIGN/BPP constant by using SIMD-ized alpha recovery.  So as
    // w*h diverges from w+h, the win factor approaches ALIGN/BPP.  We
    // only really care about the w*h >> w+h case anyway; others
    // should be fast enough even with the overhead.  (Unless the cost
    // of repainting the expanded rect is high, but in that case
    // SIMD-ized alpha recovery won't make a difference so this code
    // shouldn't be called.)
    //
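    // (A purely illustrative worked instance: with BPP = 4 and ALIGN = 16 on
    //  a 16-pixel-wide ARGB32 surface, whose 64-byte stride is itself aligned,
    //  the constraints reduce to x' % 4 == 0 and w' % 4 == 0.  A dirty rect
    //  <3,0, 5,5> would then be expanded to <0,0, 8,5>: x' moves left to the
    //  previous multiple of 4, and w' grows just enough to keep the original
    //  right edge covered while remaining a multiple of 4.)
    //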
    gfxIntSize surfaceSize = aSurface->GetSize();
    const int32_t stride = bpp * surfaceSize.width;
    if (stride != aSurface->Stride()) {
        NS_WARNING("Unexpected stride, falling back on slow alpha recovery");
        return aRect;
    }

    const int32_t x = aRect.x, y = aRect.y, w = aRect.width, h = aRect.height;
    const int32_t r = x + w;
    const int32_t sw = surfaceSize.width;
    const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride);

    // The outer two loops below keep the rightmost (|r| above) and
    // bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we
    // return only a superset of the original rect.  These loops
    // search for an aligned top-left pixel by trying to expand <x,y>
    // left and up by <dx,dy> pixels, respectively.
    //
    // Then if a properly-aligned top-left pixel is found, the
    // innermost loop tries to find an aligned stride by moving the
    // rightmost pixel rightward by dr.
    int32_t dx, dy, dr;
    for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) {
        for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) {
            if (0 != ByteAlignment(kByteAlignLog2,
                                   bpp * (x - dx), y - dy, stride)) {
                continue;
            }
            for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) {
                if (strideAlign == ByteAlignment(kByteAlignLog2,
                                                 bpp * (w + dr + dx))) {
                    goto FOUND_SOLUTION;
                }
            }
        }
    }

    // Didn't find a solution.
    return aRect;

FOUND_SOLUTION:
    nsIntRect solution = nsIntRect(x - dx, y - dy, w + dr + dx, h + dy);
    NS_ABORT_IF_FALSE(nsIntRect(0, 0, sw, surfaceSize.height).Contains(solution),
                      "'Solution' extends outside surface bounds!");
    return solution;
}
