/* -*- Mode: C++; tab-width: 20; indent-tabs-mode: nil; c-basic-offset: 4 -*-
 * This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

#include "gfxAlphaRecovery.h"
#include "gfxImageSurface.h"
#include "nsRect.h"
#include <emmintrin.h>

// This file should only be compiled on x86 and x64 systems. Additionally,
// you'll need to compile it with -msse2 if you're using GCC on x86.
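// For example, a standalone GCC compile would look something like the
// following (hypothetical invocation and file name; the real build system
// sets the flag for you):
//
//   g++ -msse2 -c gfxAlphaRecoverySSE2.cpp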

#if defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_AMD64))
__declspec(align(16)) static uint32_t greenMaski[] =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
__declspec(align(16)) static uint32_t alphaMaski[] =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
static uint32_t greenMaski[] __attribute__ ((aligned (16))) =
    { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] __attribute__ ((aligned (16))) =
    { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#elif defined(__SUNPRO_CC) && (defined(__i386) || defined(__x86_64__))
#pragma align 16 (greenMaski, alphaMaski)
static uint32_t greenMaski[] = { 0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00 };
static uint32_t alphaMaski[] = { 0xff000000, 0xff000000, 0xff000000, 0xff000000 };
#endif
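
/* The SSE2 loops below vectorize the per-pixel recovery performed by
 * RecoverPixel() (declared in gfxAlphaRecovery.h). Per pixel the idea is
 * roughly the following sketch, using the green channel as the alpha probe
 * (see the header for the authoritative scalar version):
 *
 *   alpha  = 255 - (greenOnWhite - greenOnBlack);
 *   result = (black & 0x00ffffff) | (alpha << 24);
 *
 * An opaque pixel renders identically over black and white (alpha = 255),
 * while a fully transparent pixel differs by 255 in green (alpha = 0). */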

bool
gfxAlphaRecovery::RecoverAlphaSSE2(gfxImageSurface* blackSurf,
                                   const gfxImageSurface* whiteSurf)
{
    gfxIntSize size = blackSurf->GetSize();

    if (size != whiteSurf->GetSize() ||
        (blackSurf->Format() != gfxImageFormat::ARGB32 &&
         blackSurf->Format() != gfxImageFormat::RGB24) ||
        (whiteSurf->Format() != gfxImageFormat::ARGB32 &&
         whiteSurf->Format() != gfxImageFormat::RGB24))
        return false;

    blackSurf->Flush();
    whiteSurf->Flush();

    unsigned char* blackData = blackSurf->Data();
    unsigned char* whiteData = whiteSurf->Data();
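
    // The SIMD loops below align blackData to a 16-byte boundary and assume
    // whiteData then lands on a 16-byte boundary too. That holds only if the
    // two pointers agree in their low four bits and the two strides agree
    // modulo 16.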
    if ((NS_PTR_TO_UINT32(blackData) & 0xf) != (NS_PTR_TO_UINT32(whiteData) & 0xf) ||
        (blackSurf->Stride() - whiteSurf->Stride()) & 0xf) {
        // Cannot keep these in alignment.
        return false;
    }

    __m128i greenMask = _mm_load_si128((__m128i*)greenMaski);
    __m128i alphaMask = _mm_load_si128((__m128i*)alphaMaski);

    for (int32_t i = 0; i < size.height; ++i) {
        int32_t j = 0;
        // Loop single pixels until at 16-byte alignment.
        while (NS_PTR_TO_UINT32(blackData) & 0xf && j < size.width) {
            *((uint32_t*)blackData) =
                RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
                             *reinterpret_cast<uint32_t*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
        // This extra loop allows the compiler to do some more clever register
        // management and makes it about 5% faster than with only the
        // 4-pixels-at-a-time loop.
        for (; j < size.width - 8; j += 8) {
            __m128i black1 = _mm_load_si128((__m128i*)blackData);
            __m128i white1 = _mm_load_si128((__m128i*)whiteData);
            __m128i black2 = _mm_load_si128((__m128i*)(blackData + 16));
            __m128i white2 = _mm_load_si128((__m128i*)(whiteData + 16));

            // Execute the same instructions as described in RecoverPixel, only
            // using an SSE2 packed saturated subtract.
            white1 = _mm_subs_epu8(white1, black1);
            white2 = _mm_subs_epu8(white2, black2);
            white1 = _mm_subs_epu8(greenMask, white1);
            white2 = _mm_subs_epu8(greenMask, white2);
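            // The green byte of each pixel in white1/white2 now holds the
            // recovered alpha, 0xff - (greenOnWhite - greenOnBlack); every
            // other byte has saturated to zero.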
            // Producing the final black pixel in an XMM register and storing
            // that is actually faster than doing a masked store, since the
            // latter does an unaligned store. We have the black pixel in a
            // register anyway.
            black1 = _mm_andnot_si128(alphaMask, black1);
            black2 = _mm_andnot_si128(alphaMask, black2);
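            // _mm_slli_si128 shifts the whole register left by two bytes,
            // moving each pixel's green byte into its alpha byte position;
            // the AND with alphaMask discards the bytes that crossed pixel
            // boundaries.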
            white1 = _mm_slli_si128(white1, 2);
            white2 = _mm_slli_si128(white2, 2);
            white1 = _mm_and_si128(alphaMask, white1);
            white2 = _mm_and_si128(alphaMask, white2);
            black1 = _mm_or_si128(white1, black1);
            black2 = _mm_or_si128(white2, black2);

            _mm_store_si128((__m128i*)blackData, black1);
            _mm_store_si128((__m128i*)(blackData + 16), black2);
            blackData += 32;
            whiteData += 32;
        }
        for (; j < size.width - 4; j += 4) {
            __m128i black = _mm_load_si128((__m128i*)blackData);
            __m128i white = _mm_load_si128((__m128i*)whiteData);

            white = _mm_subs_epu8(white, black);
            white = _mm_subs_epu8(greenMask, white);
            black = _mm_andnot_si128(alphaMask, black);
            white = _mm_slli_si128(white, 2);
            white = _mm_and_si128(alphaMask, white);
            black = _mm_or_si128(white, black);
            _mm_store_si128((__m128i*)blackData, black);
            blackData += 16;
            whiteData += 16;
        }
        // Loop single pixels until we're done.
        while (j < size.width) {
            *((uint32_t*)blackData) =
                RecoverPixel(*reinterpret_cast<uint32_t*>(blackData),
                             *reinterpret_cast<uint32_t*>(whiteData));
            blackData += 4;
            whiteData += 4;
            j++;
        }
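        // Step both pointers over any padding at the end of the row, to the
        // start of the next row.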
        blackData += blackSurf->Stride() - j * 4;
        whiteData += whiteSurf->Stride() - j * 4;
    }

    blackSurf->MarkDirty();

    return true;
}
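
// Byte offset of the pixel at <aX,aY> modulo the 2^aAlignToLog2 alignment;
// aX is given in bytes and aY in rows of aStride bytes each. A zero result
// means the pixel is aligned.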
static int32_t
ByteAlignment(int32_t aAlignToLog2, int32_t aX, int32_t aY=0, int32_t aStride=1)
{
    return (aX + aStride * aY) & ((1 << aAlignToLog2) - 1);
}

/*static*/ nsIntRect
gfxAlphaRecovery::AlignRectForSubimageRecovery(const nsIntRect& aRect,
                                               gfxImageSurface* aSurface)
{
    NS_ASSERTION(gfxImageFormat::ARGB32 == aSurface->Format(),
                 "Thebes grew support for non-ARGB32 COLOR_ALPHA?");
    static const int32_t kByteAlignLog2 = GoodAlignmentLog2();
    static const int32_t bpp = 4;
    static const int32_t pixPerAlign = (1 << kByteAlignLog2) / bpp;
    //
    // We're going to create a subimage of the surface with size
    // <sw,sh> for alpha recovery, and want a SIMD fast-path. The
    // rect <x,y, w,h> /needs/ to be redrawn, but it might not be
    // properly aligned for SIMD. So we want to find a rect <x',y',
    // w',h'> that's a superset of what needs to be redrawn but is
    // properly aligned. Proper alignment is
    //
    //   BPP * (x' + y' * sw) \cong 0        (mod ALIGN)
    //   BPP * w'             \cong BPP * sw (mod ALIGN)
    //
    // (We assume the pixel at surface <0,0> is already ALIGN'd.)
    // That rect (obviously) has to fit within the surface bounds, and
    // we should also minimize the extra pixels redrawn only for
    // alignment's sake. So we also want
    //
    //   minimize <x',y', w',h'>
    //   0 <= x' <= x
    //   0 <= y' <= y
    //   w <= w' <= sw
    //   h <= h' <= sh
    //
    // This is a messy integer non-linear programming problem, except
    // ... we can assume that ALIGN/BPP is a very small constant. So,
    // brute force is viable. The algorithm below will find a
    // solution if one exists, but isn't guaranteed to find the
    // minimum solution. (For SSE2, ALIGN/BPP = 4, so it'll do at
    // most 64 iterations below.) In what's likely the common case,
    // an already-aligned rectangle, it only needs 1 iteration.
    //
    // Is this alignment worth doing? Recovering alpha will take work
    // proportional to w*h (assuming alpha recovery computation isn't
    // memory bound). This analysis can lead to O(w+h) extra work
    // (with small constants). In exchange, we expect to shave off an
    // ALIGN/BPP constant factor by using SIMD-ized alpha recovery.
    // So as w*h diverges from w+h, the win factor approaches
    // ALIGN/BPP. We only really care about the w*h >> w+h case
    // anyway; others should be fast enough even with the overhead.
    // (Unless the cost of repainting the expanded rect is high, but
    // in that case SIMD-ized alpha recovery won't make a difference,
    // so this code shouldn't be called.)
    //
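    // A worked example of the search below, with ALIGN = 16 and BPP = 4 (so
    // the constraints reduce to x' + y'*sw \cong 0 and w' \cong sw, mod 4):
    // for sw = 32 and aRect = <5,3,10,8>, x + y*sw = 101 \cong 1, so we
    // expand left by dx = 1, giving 4 + 3*32 = 100 \cong 0; then w + dx = 11
    // needs dr = 1 to reach w' = 12 \cong 32 \cong 0 (mod 4). The result is
    // <4,3,12,8>.
    //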
    gfxIntSize surfaceSize = aSurface->GetSize();
    const int32_t stride = bpp * surfaceSize.width;
    if (stride != aSurface->Stride()) {
        NS_WARNING("Unexpected stride, falling back on slow alpha recovery");
        return aRect;
    }

    const int32_t x = aRect.x, y = aRect.y, w = aRect.width, h = aRect.height;
    const int32_t r = x + w;
    const int32_t sw = surfaceSize.width;
    const int32_t strideAlign = ByteAlignment(kByteAlignLog2, stride);

    // The outer two loops below keep the rightmost (|r| above) and
    // bottommost pixels in |aRect| fixed wrt <x,y>, to ensure that we
    // return only a superset of the original rect. These loops
    // search for an aligned top-left pixel by trying to expand <x,y>
    // left and up by <dx,dy> pixels, respectively.
    //
    // Then if a properly-aligned top-left pixel is found, the
    // innermost loop tries to find an aligned stride by moving the
    // rightmost pixel rightward by dr.
    int32_t dx, dy, dr;
    for (dy = 0; (dy < pixPerAlign) && (y - dy >= 0); ++dy) {
        for (dx = 0; (dx < pixPerAlign) && (x - dx >= 0); ++dx) {
            if (0 != ByteAlignment(kByteAlignLog2,
                                   bpp * (x - dx), y - dy, stride)) {
                continue;
            }
            for (dr = 0; (dr < pixPerAlign) && (r + dr <= sw); ++dr) {
                if (strideAlign == ByteAlignment(kByteAlignLog2,
                                                 bpp * (w + dr + dx))) {
                    goto FOUND_SOLUTION;
                }
            }
        }
    }

    // Didn't find a solution.
    return aRect;

FOUND_SOLUTION:
    nsIntRect solution = nsIntRect(x - dx, y - dy, w + dr + dx, h + dy);
    NS_ABORT_IF_FALSE(nsIntRect(0, 0, sw, surfaceSize.height).Contains(solution),
                      "'Solution' extends outside surface bounds!");
    return solution;
}