|
1 /* |
|
2 * Copyright 2011 Google Inc. |
|
3 * |
|
4 * Use of this source code is governed by a BSD-style license that can be |
|
5 * found in the LICENSE file. |
|
6 */ |
|
7 |
|
8 #include "SkBlitRect_opts_SSE2.h" |
|
9 #include "SkBlitRow.h" |
|
10 #include "SkColorPriv.h" |
|
11 |
|
12 #include <emmintrin.h> |
|
13 |
|
14 /** Simple blitting of opaque rectangles less than 31 pixels wide: |
|
15 inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2. |
|
16 */ |
|
17 static void BlitRect32_OpaqueNarrow_SSE2(SkPMColor* SK_RESTRICT destination, |
|
18 int width, int height, |
|
19 size_t rowBytes, uint32_t color) { |
|
20 SkASSERT(255 == SkGetPackedA32(color)); |
|
21 SkASSERT(width > 0); |
|
22 SkASSERT(width < 31); |
|
23 |
|
24 while (--height >= 0) { |
|
25 SkPMColor* dst = destination; |
|
26 int count = width; |
|
27 |
|
28 while (count > 4) { |
|
29 *dst++ = color; |
|
30 *dst++ = color; |
|
31 *dst++ = color; |
|
32 *dst++ = color; |
|
33 count -= 4; |
|
34 } |
|
35 |
|
36 while (count > 0) { |
|
37 *dst++ = color; |
|
38 --count; |
|
39 } |
|
40 |
|
41 destination = (uint32_t*)((char*)destination + rowBytes); |
|
42 } |
|
43 } |
|
44 |
|
45 /** |
|
46 Fast blitting of opaque rectangles at least 31 pixels wide: |
|
47 inlines and merges sections of Color32_SSE2 and sk_memset32_SSE2. |
|
48 A 31 pixel rectangle is guaranteed to have at least one |
|
49 16-pixel aligned span that can take advantage of mm_store. |
|
50 */ |
|
51 static void BlitRect32_OpaqueWide_SSE2(SkPMColor* SK_RESTRICT destination, |
|
52 int width, int height, |
|
53 size_t rowBytes, uint32_t color) { |
|
54 SkASSERT(255 == SkGetPackedA32(color)); |
|
55 SkASSERT(width >= 31); |
|
56 |
|
57 __m128i color_wide = _mm_set1_epi32(color); |
|
58 while (--height >= 0) { |
|
59 // Prefetching one row ahead to L1 cache can equal hardware |
|
60 // performance for large/tall rects, but never *beats* |
|
61 // hardware performance. |
|
62 SkPMColor* dst = destination; |
|
63 int count = width; |
|
64 |
|
65 while (((size_t)dst) & 0x0F) { |
|
66 *dst++ = color; |
|
67 --count; |
|
68 } |
|
69 __m128i *d = reinterpret_cast<__m128i*>(dst); |
|
70 |
|
71 // Googling suggests _mm_stream is only going to beat _mm_store |
|
72 // for things that wouldn't fit in L2 cache anyway, typically |
|
73 // >500kB, and precisely fill cache lines. For us, with |
|
74 // arrays > 100k elements _mm_stream is still 100%+ slower than |
|
75 // mm_store. |
|
76 |
|
77 // Unrolling to count >= 64 is a break-even for most |
|
78 // input patterns; we seem to be saturating the bus and having |
|
79 // low enough overhead at 32. |
|
80 |
|
81 while (count >= 32) { |
|
82 _mm_store_si128(d++, color_wide); |
|
83 _mm_store_si128(d++, color_wide); |
|
84 _mm_store_si128(d++, color_wide); |
|
85 _mm_store_si128(d++, color_wide); |
|
86 _mm_store_si128(d++, color_wide); |
|
87 _mm_store_si128(d++, color_wide); |
|
88 _mm_store_si128(d++, color_wide); |
|
89 _mm_store_si128(d++, color_wide); |
|
90 count -= 32; |
|
91 } |
|
92 if (count >= 16) { |
|
93 _mm_store_si128(d++, color_wide); |
|
94 _mm_store_si128(d++, color_wide); |
|
95 _mm_store_si128(d++, color_wide); |
|
96 _mm_store_si128(d++, color_wide); |
|
97 count -= 16; |
|
98 } |
|
99 dst = reinterpret_cast<uint32_t*>(d); |
|
100 |
|
101 // Unrolling the loop in the Narrow code is a significant performance |
|
102 // gain, but unrolling this loop appears to make no difference in |
|
103 // benchmarks with either mm_store_si128 or individual sets. |
|
104 |
|
105 while (count > 0) { |
|
106 *dst++ = color; |
|
107 --count; |
|
108 } |
|
109 |
|
110 destination = (uint32_t*)((char*)destination + rowBytes); |
|
111 } |
|
112 } |
|
113 |
|
114 void ColorRect32_SSE2(SkPMColor* destination, |
|
115 int width, int height, |
|
116 size_t rowBytes, uint32_t color) { |
|
117 if (0 == height || 0 == width || 0 == color) { |
|
118 return; |
|
119 } |
|
120 unsigned colorA = SkGetPackedA32(color); |
|
121 colorA = 0; // skip below if () for now...(has been disabled since this was added in r3423). |
|
122 if (255 == colorA) { |
|
123 if (width < 31) { |
|
124 BlitRect32_OpaqueNarrow_SSE2(destination, width, height, |
|
125 rowBytes, color); |
|
126 } else { |
|
127 BlitRect32_OpaqueWide_SSE2(destination, width, height, |
|
128 rowBytes, color); |
|
129 } |
|
130 } else { |
|
131 SkBlitRow::ColorRect32(destination, width, height, rowBytes, color); |
|
132 } |
|
133 } |