/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"
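
/* Per-channel 16-bit constants used by the combiners below.  They are
 * expected to be initialized once, elsewhere (when the SSE2 implementation
 * is set up, which is not part of this excerpt), before any of these
 * routines run.
 */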
static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

static __m128i mask_565_rb;
static __m128i mask_565_pack_multiplier;

static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}
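
/* Expand r5g6b5 values (one per 32-bit lane) to 8888: each field is shifted
 * into its byte and its top bits are replicated into the low bits, so that
 * full-scale 5- and 6-bit values map to 0xff.
 */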
static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    t  = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t = _mm_and_si128 (g, mask_565_fix_g);
    t = _mm_srli_epi32 (t, 6);
    g = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}

static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}
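
/* Pack two registers of four packed 8888 pixels each into eight r5g6b5
 * values.  _mm_madd_epi16 with mask_565_pack_multiplier moves the red and
 * blue fields into their 565 positions in one step; the final unsigned
 * 32->16 pack is emulated with shifts plus _mm_packs_epi32, since
 * _mm_packus_epi32 only exists from SSE4.1 onwards.
 */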
static force_inline __m128i
pack_565_2packedx128_128 (__m128i lo, __m128i hi)
{
    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);

    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);

    __m128i g0 = _mm_and_si128 (lo, mask_green);
    __m128i g1 = _mm_and_si128 (hi, mask_green);

    t0 = _mm_or_si128 (t0, g0);
    t1 = _mm_or_si128 (t1, g1);

    /* Simulates _mm_packus_epi32 */
    t0 = _mm_slli_epi32 (t0, 16 - 5);
    t1 = _mm_slli_epi32 (t1, 16 - 5);
    t0 = _mm_srai_epi32 (t0, 16);
    t1 = _mm_srai_epi32 (t1, 16);
    return _mm_packs_epi32 (t0, t1);
}

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}
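
/* _mm_movemask_epi8 yields one bit per byte; masking with 0x8888 keeps only
 * the top byte of every 32-bit pixel, i.e. the alpha of a8r8g8b8 data.
 * is_opaque therefore checks that all four alphas are 0xff, is_zero that the
 * whole register is zero, and is_transparent that all four alphas are zero.
 */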
static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}

static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}
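
/* After unpacking, each pixel occupies four 16-bit words in one 64-bit half
 * of the register.  The shuffles below broadcast word 3 (the alpha channel)
 * of each half across that half's four words.
 */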
static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}
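
/* Per-channel multiply of two 0..0xff values held in 16-bit lanes, with the
 * usual pixman rounding: t = a * b + 0x80, result = (t * 0x0101) >> 16,
 * which is an exact rounded division of a * b by 255.
 */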
static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}
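
/* src * alpha_dst + dst * alpha_src, per channel, with unsigned saturation.
 * This is the building block of the ATOP and XOR combiners further down.
 */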
static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}

static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}

/* load 4 pixels from a 16-byte boundary aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte
 * boundary aligned address
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels on a 16-byte boundary aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels on an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

static force_inline __m128i
load_32_1x128 (uint32_t data)
{
    return _mm_cvtsi32_si128 (data);
}

static force_inline __m128i
expand_alpha_rev_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
expand_pixel_8_1x128 (uint8_t data)
{
    return _mm_shufflelo_epi16 (
        unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
pix_multiply_1x128 (__m128i data,
                    __m128i alpha)
{
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
                                            mask_0080),
                            mask_0101);
}

static force_inline __m128i
pix_add_multiply_1x128 (__m128i* src,
                        __m128i* alpha_dst,
                        __m128i* dst,
                        __m128i* alpha_src)
{
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);

    return _mm_adds_epu8 (t1, t2);
}

static force_inline __m128i
negate_1x128 (__m128i data)
{
    return _mm_xor_si128 (data, mask_00ff);
}

static force_inline __m128i
invert_colors_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m128i
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}

static force_inline __m128i
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
                       pix_multiply_1x128 (*alpha, *mask),
                       *dst);
}

static force_inline __m128i
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
    __m128i alpha = expand_alpha_1x128 (src);

    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
                                           _mm_or_si128 (alpha, mask_alpha)),
                       alpha,
                       dst);
}

static force_inline uint32_t
pack_1x128_32 (__m128i data)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}

static force_inline __m128i
expand565_16_1x128 (uint16_t pixel)
{
    __m128i m = _mm_cvtsi32_si128 (pixel);

    m = unpack_565_to_8888 (m);

    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
}
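
/* One pixel of OVER: dst = src + (1 - src.alpha) * dst, with shortcuts for
 * fully opaque and fully transparent sources.
 */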
static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m128i xmms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        xmms = unpack_32_1x128 (src);
        return pack_1x128_32 (
            over_1x128 (xmms, expand_alpha_1x128 (xmms),
                        unpack_32_1x128 (dst)));
    }

    return dst;
}
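
/* combine1 and combine4 fetch 1 or 4 source pixels and, when a mask is
 * present, scale them by the mask's alpha channel; this is the "unified
 * alpha" path shared by the *_u combiners below.
 */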
static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m128i ms, mm;

        mm = unpack_32_1x128 (*pm);
        mm = expand_alpha_1x128 (mm);

        ms = unpack_32_1x128 (s);
        ms = pix_multiply_1x128 (ms, mm);

        s = pack_1x128_32 (ms);
    }

    return s;
}

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}
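
/* The combiners below all follow the same shape: a scalar loop until the
 * destination pointer is 16-byte aligned, a 4-pixel SSE2 loop that stores
 * through aligned writes, and a scalar tail for whatever is left.
 */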
static force_inline void
core_combine_over_u_sse2_mask (uint32_t *      pd,
                               const uint32_t* ps,
                               const uint32_t* pm,
                               int             w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i mask = load_128_unaligned ((__m128i *)pm);

        if (!is_zero (mask))
        {
            __m128i src;
            __m128i src_hi, src_lo;
            __m128i mask_hi, mask_lo;
            __m128i alpha_hi, alpha_lo;

            src = load_128_unaligned ((__m128i *)ps);

            if (is_opaque (_mm_and_si128 (src, mask)))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);
                __m128i dst_hi, dst_lo;

                unpack_128_2x128 (mask, &mask_lo, &mask_hi);
                unpack_128_2x128 (src, &src_lo, &src_hi);

                expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
                pix_multiply_2x128 (&src_lo, &src_hi,
                                    &mask_lo, &mask_hi,
                                    &src_lo, &src_hi);

                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);

                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        pm += 4;
        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;

        w--;
    }
}

static force_inline void
core_combine_over_u_sse2_no_mask (uint32_t *      pd,
                                  const uint32_t* ps,
                                  int             w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i src;
        __m128i src_hi, src_lo, dst_hi, dst_lo;
        __m128i alpha_hi, alpha_lo;

        src = load_128_unaligned ((__m128i *)ps);

        if (!is_zero (src))
        {
            if (is_opaque (src))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);

                unpack_128_2x128 (src, &src_lo, &src_hi);
                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);
                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;

        w--;
    }
}

static force_inline void
sse2_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    if (pm)
        core_combine_over_u_sse2_mask (pd, ps, pm, w);
    else
        core_combine_over_u_sse2_no_mask (pd, ps, w);
}

static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);

        /* rebuild the 4 pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        w -= 4;
        ps += 4;
        pd += 4;

        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (dst),
                                expand_alpha_1x128 (unpack_32_1x128 (src))));
    }

    return dst;
}

static void
sse2_combine_in_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               pd,
                   const uint32_t *         ps,
                   const uint32_t *         pm,
                   int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               pd,
                           const uint32_t *         ps,
                           const uint32_t *         pm,
                           int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    while (w && ((uintptr_t)pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));

        if (pm)
            pm++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;

        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        ps++;
        if (pm)
            pm++;
        w--;
    }
}

static void
sse2_combine_out_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    while (w && ((uintptr_t)pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}
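
/* ATOP: result = src * dst.alpha + dst * (1 - src.alpha). */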
static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
    __m128i da = expand_alpha_1x128 (d);

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = expand_alpha_1x128 (s);
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}
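
/* XOR: result = src * (1 - dst.alpha) + dst * (1 - src.alpha). */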
static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
}

static void
sse2_combine_xor_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
sse2_combine_add_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    while (w && (uintptr_t)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        w--;
    }

    while (w >= 4)
    {
        __m128i s;

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        if (pm)
            pm++;
    }
}
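
/* SATURATE: when the source alpha exceeds the room left in the destination
 * (~dst.alpha), the source is first scaled by ~dst.alpha / src.alpha, so that
 * the following saturating add cannot overflow the destination alpha.
 */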
static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m128i ms = unpack_32_1x128 (src);
    __m128i md = unpack_32_1x128 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x128 (
            ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
}

static void
sse2_combine_saturate_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
                         uint32_t *               pd,
                         const uint32_t *         ps,
                         const uint32_t *         pm,
                         int                      w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    while (w && (uintptr_t)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst = load_128_aligned ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

        pack_cmp = _mm_movemask_epi8 (
            _mm_cmpgt_epi32 (
                _mm_srli_epi32 (xmm_src, 24),
                _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

        /* if some src alpha is greater than the respective ~dst alpha */
        if (pack_cmp)
        {
            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
        }
        else
        {
            save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

            pd += 4;
            ps += 4;
            if (pm)
                pm += 4;
        }

        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
    }
}
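
/* The *_ca ("component alpha") combiners below take a full per-channel mask:
 * each channel of the mask scales the corresponding channel of the source,
 * rather than only the mask's alpha being used.
 */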
static void
sse2_combine_src_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }

    while (w >= 4)
    {
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
        w--;
    }
}

static force_inline uint32_t
core_combine_over_ca_pixel_sse2 (uint32_t src,
                                 uint32_t mask,
                                 uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i expAlpha = expand_alpha_1x128 (s);
    __m128i unpk_mask = unpack_32_1x128 (mask);
    __m128i unpk_dst = unpack_32_1x128 (dst);

    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
}

static void
sse2_combine_over_ca (pixman_implementation_t *imp,
                      pixman_op_t              op,
                      uint32_t *               pd,
                      const uint32_t *         ps,
                      const uint32_t *         pm,
                      int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                       &xmm_alpha_lo, &xmm_alpha_hi,
                       &xmm_mask_lo, &xmm_mask_hi,
                       &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static force_inline uint32_t
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
                                         uint32_t mask,
                                         uint32_t dst)
{
    __m128i d = unpack_32_1x128 (dst);

    return pack_1x128_32 (
        over_1x128 (d, expand_alpha_1x128 (d),
                    pix_multiply_1x128 (unpack_32_1x128 (src),
                                        unpack_32_1x128 (mask))));
}

static void
sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              uint32_t *               pd,
                              const uint32_t *         ps,
                              const uint32_t *         pm,
                              int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_mask_lo, &xmm_mask_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
        w--;
    }
}

static void
sse2_combine_in_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                expand_alpha_1x128 (unpack_32_1x128 (d))));

        w--;
    }
}

static void
sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                pix_multiply_1x128 (unpack_32_1x128 (m),
                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                pix_multiply_1x128 (unpack_32_1x128 (m),
                                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        w--;
    }
}

static void
sse2_combine_out_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);
        negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
                      &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                pix_multiply_1x128 (
                    unpack_32_1x128 (s), unpack_32_1x128 (m)),
                negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));

        w--;
    }
}

static void
sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, m, d;

    __m128i xmm_alpha_lo, xmm_alpha_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_mask_lo, xmm_mask_hi;

    while (w && (uintptr_t)pd & 15)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                negate_1x128 (pix_multiply_1x128 (
                    unpack_32_1x128 (m),
                    expand_alpha_1x128 (unpack_32_1x128 (s))))));
        w--;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*)pd);
        xmm_src_hi = load_128_unaligned ((__m128i*)ps);
        xmm_mask_hi = load_128_unaligned ((__m128i*)pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi,
                            &xmm_mask_lo, &xmm_mask_hi);

        negate_2x128 (xmm_mask_lo, xmm_mask_hi,
                      &xmm_mask_lo, &xmm_mask_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_mask_lo, &xmm_mask_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        pm += 4;
        w -= 4;
    }

    while (w)
    {
        s = *ps++;
        m = *pm++;
        d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d),
                negate_1x128 (pix_multiply_1x128 (
                    unpack_32_1x128 (m),
                    expand_alpha_1x128 (unpack_32_1x128 (s))))));
        w--;
    }
}
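
/* Component-alpha ATOP: per channel,
 * result = src * mask * dst.alpha + dst * (1 - mask * src.alpha).
 */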
2018 static force_inline uint32_t |
|
2019 core_combine_atop_ca_pixel_sse2 (uint32_t src, |
|
2020 uint32_t mask, |
|
2021 uint32_t dst) |
|
2022 { |
|
2023 __m128i m = unpack_32_1x128 (mask); |
|
2024 __m128i s = unpack_32_1x128 (src); |
|
2025 __m128i d = unpack_32_1x128 (dst); |
|
2026 __m128i sa = expand_alpha_1x128 (s); |
|
2027 __m128i da = expand_alpha_1x128 (d); |
|
2028 |
|
2029 s = pix_multiply_1x128 (s, m); |
|
2030 m = negate_1x128 (pix_multiply_1x128 (m, sa)); |
|
2031 |
|
2032 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); |
|
2033 } |
|
2034 |
|
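/* ATOP combiner with a component-alpha mask:
 *     dest = src * mask * dest.alpha + dest * (1 - mask * src.alpha)
 * The scalar helper above handles the unaligned head and tail pixels.
 */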
2035 static void |
|
2036 sse2_combine_atop_ca (pixman_implementation_t *imp, |
|
2037 pixman_op_t op, |
|
2038 uint32_t * pd, |
|
2039 const uint32_t * ps, |
|
2040 const uint32_t * pm, |
|
2041 int w) |
|
2042 { |
|
2043 uint32_t s, m, d; |
|
2044 |
|
2045 __m128i xmm_src_lo, xmm_src_hi; |
|
2046 __m128i xmm_dst_lo, xmm_dst_hi; |
|
2047 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; |
|
2048 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; |
|
2049 __m128i xmm_mask_lo, xmm_mask_hi; |
|
2050 |
|
2051 while (w && (uintptr_t)pd & 15) |
|
2052 { |
|
2053 s = *ps++; |
|
2054 m = *pm++; |
|
2055 d = *pd; |
|
2056 |
|
2057 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); |
|
2058 w--; |
|
2059 } |
|
2060 |
|
2061 while (w >= 4) |
|
2062 { |
|
2063 xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
|
2064 xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
|
2065 xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
|
2066 |
|
2067 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
|
2068 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
|
2069 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
|
2070 |
|
2071 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
|
2072 &xmm_alpha_src_lo, &xmm_alpha_src_hi); |
|
2073 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
|
2074 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
|
2075 |
|
2076 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
|
2077 &xmm_mask_lo, &xmm_mask_hi, |
|
2078 &xmm_src_lo, &xmm_src_hi); |
|
2079 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, |
|
2080 &xmm_alpha_src_lo, &xmm_alpha_src_hi, |
|
2081 &xmm_mask_lo, &xmm_mask_hi); |
|
2082 |
|
2083 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
|
2084 |
|
2085 pix_add_multiply_2x128 ( |
|
2086 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, |
|
2087 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, |
|
2088 &xmm_dst_lo, &xmm_dst_hi); |
|
2089 |
|
2090 save_128_aligned ( |
|
2091 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
2092 |
|
2093 ps += 4; |
|
2094 pd += 4; |
|
2095 pm += 4; |
|
2096 w -= 4; |
|
2097 } |
|
2098 |
|
2099 while (w) |
|
2100 { |
|
2101 s = *ps++; |
|
2102 m = *pm++; |
|
2103 d = *pd; |
|
2104 |
|
2105 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); |
|
2106 w--; |
|
2107 } |
|
2108 } |
|
2109 |
|
2110 static force_inline uint32_t |
|
2111 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src, |
|
2112 uint32_t mask, |
|
2113 uint32_t dst) |
|
2114 { |
|
2115 __m128i m = unpack_32_1x128 (mask); |
|
2116 __m128i s = unpack_32_1x128 (src); |
|
2117 __m128i d = unpack_32_1x128 (dst); |
|
2118 |
|
2119 __m128i da = negate_1x128 (expand_alpha_1x128 (d)); |
|
2120 __m128i sa = expand_alpha_1x128 (s); |
|
2121 |
|
2122 s = pix_multiply_1x128 (s, m); |
|
2123 m = pix_multiply_1x128 (m, sa); |
|
2124 |
|
2125 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); |
|
2126 } |
|
2127 |
|
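/* ATOP_REVERSE combiner with a component-alpha mask:
 *     dest = src * mask * (1 - dest.alpha) + dest * mask * src.alpha
 */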
2128 static void |
|
2129 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, |
|
2130 pixman_op_t op, |
|
2131 uint32_t * pd, |
|
2132 const uint32_t * ps, |
|
2133 const uint32_t * pm, |
|
2134 int w) |
|
2135 { |
|
2136 uint32_t s, m, d; |
|
2137 |
|
2138 __m128i xmm_src_lo, xmm_src_hi; |
|
2139 __m128i xmm_dst_lo, xmm_dst_hi; |
|
2140 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; |
|
2141 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; |
|
2142 __m128i xmm_mask_lo, xmm_mask_hi; |
|
2143 |
|
2144 while (w && (uintptr_t)pd & 15) |
|
2145 { |
|
2146 s = *ps++; |
|
2147 m = *pm++; |
|
2148 d = *pd; |
|
2149 |
|
2150 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); |
|
2151 w--; |
|
2152 } |
|
2153 |
|
2154 while (w >= 4) |
|
2155 { |
|
2156 xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
|
2157 xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
|
2158 xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
|
2159 |
|
2160 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
|
2161 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
|
2162 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
|
2163 |
|
2164 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
|
2165 &xmm_alpha_src_lo, &xmm_alpha_src_hi); |
|
2166 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
|
2167 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
|
2168 |
|
2169 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
|
2170 &xmm_mask_lo, &xmm_mask_hi, |
|
2171 &xmm_src_lo, &xmm_src_hi); |
|
2172 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, |
|
2173 &xmm_alpha_src_lo, &xmm_alpha_src_hi, |
|
2174 &xmm_mask_lo, &xmm_mask_hi); |
|
2175 |
|
2176 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, |
|
2177 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
|
2178 |
|
2179 pix_add_multiply_2x128 ( |
|
2180 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, |
|
2181 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, |
|
2182 &xmm_dst_lo, &xmm_dst_hi); |
|
2183 |
|
2184 save_128_aligned ( |
|
2185 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
2186 |
|
2187 ps += 4; |
|
2188 pd += 4; |
|
2189 pm += 4; |
|
2190 w -= 4; |
|
2191 } |
|
2192 |
|
2193 while (w) |
|
2194 { |
|
2195 s = *ps++; |
|
2196 m = *pm++; |
|
2197 d = *pd; |
|
2198 |
|
2199 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); |
|
2200 w--; |
|
2201 } |
|
2202 } |
|
2203 |
|
2204 static force_inline uint32_t |
|
2205 core_combine_xor_ca_pixel_sse2 (uint32_t src, |
|
2206 uint32_t mask, |
|
2207 uint32_t dst) |
|
2208 { |
|
2209 __m128i a = unpack_32_1x128 (mask); |
|
2210 __m128i s = unpack_32_1x128 (src); |
|
2211 __m128i d = unpack_32_1x128 (dst); |
|
2212 |
|
2213 __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 ( |
|
2214 a, expand_alpha_1x128 (s))); |
|
2215 __m128i dest = pix_multiply_1x128 (s, a); |
|
2216 __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d)); |
|
2217 |
|
2218 return pack_1x128_32 (pix_add_multiply_1x128 (&d, |
|
2219 &alpha_dst, |
|
2220 &dest, |
|
2221 &alpha_src)); |
|
2222 } |
|
2223 |
|
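/* XOR combiner with a component-alpha mask:
 *     dest = src * mask * (1 - dest.alpha) + dest * (1 - mask * src.alpha)
 */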
2224 static void |
|
2225 sse2_combine_xor_ca (pixman_implementation_t *imp, |
|
2226 pixman_op_t op, |
|
2227 uint32_t * pd, |
|
2228 const uint32_t * ps, |
|
2229 const uint32_t * pm, |
|
2230 int w) |
|
2231 { |
|
2232 uint32_t s, m, d; |
|
2233 |
|
2234 __m128i xmm_src_lo, xmm_src_hi; |
|
2235 __m128i xmm_dst_lo, xmm_dst_hi; |
|
2236 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; |
|
2237 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; |
|
2238 __m128i xmm_mask_lo, xmm_mask_hi; |
|
2239 |
|
2240 while (w && (uintptr_t)pd & 15) |
|
2241 { |
|
2242 s = *ps++; |
|
2243 m = *pm++; |
|
2244 d = *pd; |
|
2245 |
|
2246 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); |
|
2247 w--; |
|
2248 } |
|
2249 |
|
2250 while (w >= 4) |
|
2251 { |
|
2252 xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
|
2253 xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
|
2254 xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
|
2255 |
|
2256 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
|
2257 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
|
2258 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
|
2259 |
|
2260 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
|
2261 &xmm_alpha_src_lo, &xmm_alpha_src_hi); |
|
2262 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
|
2263 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
|
2264 |
|
2265 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
|
2266 &xmm_mask_lo, &xmm_mask_hi, |
|
2267 &xmm_src_lo, &xmm_src_hi); |
|
2268 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, |
|
2269 &xmm_alpha_src_lo, &xmm_alpha_src_hi, |
|
2270 &xmm_mask_lo, &xmm_mask_hi); |
|
2271 |
|
2272 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, |
|
2273 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
|
2274 negate_2x128 (xmm_mask_lo, xmm_mask_hi, |
|
2275 &xmm_mask_lo, &xmm_mask_hi); |
|
2276 |
|
2277 pix_add_multiply_2x128 ( |
|
2278 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, |
|
2279 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, |
|
2280 &xmm_dst_lo, &xmm_dst_hi); |
|
2281 |
|
2282 save_128_aligned ( |
|
2283 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
2284 |
|
2285 ps += 4; |
|
2286 pd += 4; |
|
2287 pm += 4; |
|
2288 w -= 4; |
|
2289 } |
|
2290 |
|
2291 while (w) |
|
2292 { |
|
2293 s = *ps++; |
|
2294 m = *pm++; |
|
2295 d = *pd; |
|
2296 |
|
2297 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); |
|
2298 w--; |
|
2299 } |
|
2300 } |
|
2301 |
|
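/* ADD combiner with a component-alpha mask:
 *     dest = saturate (src * mask + dest)
 */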
2302 static void |
|
2303 sse2_combine_add_ca (pixman_implementation_t *imp, |
|
2304 pixman_op_t op, |
|
2305 uint32_t * pd, |
|
2306 const uint32_t * ps, |
|
2307 const uint32_t * pm, |
|
2308 int w) |
|
2309 { |
|
2310 uint32_t s, m, d; |
|
2311 |
|
2312 __m128i xmm_src_lo, xmm_src_hi; |
|
2313 __m128i xmm_dst_lo, xmm_dst_hi; |
|
2314 __m128i xmm_mask_lo, xmm_mask_hi; |
|
2315 |
|
2316 while (w && (uintptr_t)pd & 15) |
|
2317 { |
|
2318 s = *ps++; |
|
2319 m = *pm++; |
|
2320 d = *pd; |
|
2321 |
|
2322 *pd++ = pack_1x128_32 ( |
|
2323 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), |
|
2324 unpack_32_1x128 (m)), |
|
2325 unpack_32_1x128 (d))); |
|
2326 w--; |
|
2327 } |
|
2328 |
|
2329 while (w >= 4) |
|
2330 { |
|
2331 xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
|
2332 xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
|
2333 xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
|
2334 |
|
2335 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
|
2336 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
|
2337 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
|
2338 |
|
2339 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
|
2340 &xmm_mask_lo, &xmm_mask_hi, |
|
2341 &xmm_src_lo, &xmm_src_hi); |
|
2342 |
|
2343 save_128_aligned ( |
|
2344 (__m128i*)pd, pack_2x128_128 ( |
|
2345 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo), |
|
2346 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi))); |
|
2347 |
|
2348 ps += 4; |
|
2349 pd += 4; |
|
2350 pm += 4; |
|
2351 w -= 4; |
|
2352 } |
|
2353 |
|
2354 while (w) |
|
2355 { |
|
2356 s = *ps++; |
|
2357 m = *pm++; |
|
2358 d = *pd; |
|
2359 |
|
2360 *pd++ = pack_1x128_32 ( |
|
2361 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), |
|
2362 unpack_32_1x128 (m)), |
|
2363 unpack_32_1x128 (d))); |
|
2364 w--; |
|
2365 } |
|
2366 } |
|
2367 |
|
2368 static force_inline __m128i |
|
2369 create_mask_16_128 (uint16_t mask) |
|
2370 { |
|
2371 return _mm_set1_epi16 (mask); |
|
2372 } |
|
2373 |
|
2374 /* Work around a code generation bug in Sun Studio 12. */ |
|
2375 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590) |
|
2376 # define create_mask_2x32_128(mask0, mask1) \ |
|
2377 (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1))) |
|
2378 #else |
|
2379 static force_inline __m128i |
|
2380 create_mask_2x32_128 (uint32_t mask0, |
|
2381 uint32_t mask1) |
|
2382 { |
|
2383 return _mm_set_epi32 (mask0, mask1, mask0, mask1); |
|
2384 } |
|
2385 #endif |
|
2386 |
|
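/* OVER: solid source over an a8r8g8b8/x8r8g8b8 destination.  The
 * source pixel and its expanded alpha are computed once outside the
 * loops; a fully transparent source is a no-op.
 */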
2387 static void |
|
2388 sse2_composite_over_n_8888 (pixman_implementation_t *imp, |
|
2389 pixman_composite_info_t *info) |
|
2390 { |
|
2391 PIXMAN_COMPOSITE_ARGS (info); |
|
2392 uint32_t src; |
|
2393 uint32_t *dst_line, *dst, d; |
|
2394 int32_t w; |
|
2395 int dst_stride; |
|
2396 __m128i xmm_src, xmm_alpha; |
|
2397 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
|
2398 |
|
2399 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
2400 |
|
2401 if (src == 0) |
|
2402 return; |
|
2403 |
|
2404 PIXMAN_IMAGE_GET_LINE ( |
|
2405 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
2406 |
|
2407 xmm_src = expand_pixel_32_1x128 (src); |
|
2408 xmm_alpha = expand_alpha_1x128 (xmm_src); |
|
2409 |
|
2410 while (height--) |
|
2411 { |
|
2412 dst = dst_line; |
|
2413 |
|
2414 dst_line += dst_stride; |
|
2415 w = width; |
|
2416 |
|
2417 while (w && (uintptr_t)dst & 15) |
|
2418 { |
|
2419 d = *dst; |
|
2420 *dst++ = pack_1x128_32 (over_1x128 (xmm_src, |
|
2421 xmm_alpha, |
|
2422 unpack_32_1x128 (d))); |
|
2423 w--; |
|
2424 } |
|
2425 |
|
2426 while (w >= 4) |
|
2427 { |
|
2428 xmm_dst = load_128_aligned ((__m128i*)dst); |
|
2429 |
|
2430 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
|
2431 |
|
2432 over_2x128 (&xmm_src, &xmm_src, |
|
2433 &xmm_alpha, &xmm_alpha, |
|
2434 &xmm_dst_lo, &xmm_dst_hi); |
|
2435 |
|
2436 /* rebuild the 4 pixel data and save */ |
|
2437 save_128_aligned ( |
|
2438 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
2439 |
|
2440 w -= 4; |
|
2441 dst += 4; |
|
2442 } |
|
2443 |
|
2444 while (w) |
|
2445 { |
|
2446 d = *dst; |
|
2447 *dst++ = pack_1x128_32 (over_1x128 (xmm_src, |
|
2448 xmm_alpha, |
|
2449 unpack_32_1x128 (d))); |
|
2450 w--; |
|
2451 } |
|
2452 |
|
2453 } |
|
2454 } |
|
2455 |
|
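/* OVER: solid source over an r5g6b5 destination.  Destination pixels
 * are expanded to 8888, blended, and packed back to 565, eight at a
 * time in the vector loop.
 */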
2456 static void |
|
2457 sse2_composite_over_n_0565 (pixman_implementation_t *imp, |
|
2458 pixman_composite_info_t *info) |
|
2459 { |
|
2460 PIXMAN_COMPOSITE_ARGS (info); |
|
2461 uint32_t src; |
|
2462 uint16_t *dst_line, *dst, d; |
|
2463 int32_t w; |
|
2464 int dst_stride; |
|
2465 __m128i xmm_src, xmm_alpha; |
|
2466 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; |
|
2467 |
|
2468 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
2469 |
|
2470 if (src == 0) |
|
2471 return; |
|
2472 |
|
2473 PIXMAN_IMAGE_GET_LINE ( |
|
2474 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
|
2475 |
|
2476 xmm_src = expand_pixel_32_1x128 (src); |
|
2477 xmm_alpha = expand_alpha_1x128 (xmm_src); |
|
2478 |
|
2479 while (height--) |
|
2480 { |
|
2481 dst = dst_line; |
|
2482 |
|
2483 dst_line += dst_stride; |
|
2484 w = width; |
|
2485 |
|
2486 while (w && (uintptr_t)dst & 15) |
|
2487 { |
|
2488 d = *dst; |
|
2489 |
|
2490 *dst++ = pack_565_32_16 ( |
|
2491 pack_1x128_32 (over_1x128 (xmm_src, |
|
2492 xmm_alpha, |
|
2493 expand565_16_1x128 (d)))); |
|
2494 w--; |
|
2495 } |
|
2496 |
|
2497 while (w >= 8) |
|
2498 { |
|
2499 xmm_dst = load_128_aligned ((__m128i*)dst); |
|
2500 |
|
2501 unpack_565_128_4x128 (xmm_dst, |
|
2502 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); |
|
2503 |
|
2504 over_2x128 (&xmm_src, &xmm_src, |
|
2505 &xmm_alpha, &xmm_alpha, |
|
2506 &xmm_dst0, &xmm_dst1); |
|
2507 over_2x128 (&xmm_src, &xmm_src, |
|
2508 &xmm_alpha, &xmm_alpha, |
|
2509 &xmm_dst2, &xmm_dst3); |
|
2510 |
|
2511 xmm_dst = pack_565_4x128_128 ( |
|
2512 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); |
|
2513 |
|
2514 save_128_aligned ((__m128i*)dst, xmm_dst); |
|
2515 |
|
2516 dst += 8; |
|
2517 w -= 8; |
|
2518 } |
|
2519 |
|
2520 while (w--) |
|
2521 { |
|
2522 d = *dst; |
|
2523 *dst++ = pack_565_32_16 ( |
|
2524 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha, |
|
2525 expand565_16_1x128 (d)))); |
|
2526 } |
|
2527 } |
|
2528 |
|
2529 } |
|
2530 |
|
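/* ADD: solid source with an a8r8g8b8 component-alpha mask:
 *     dest = saturate (src * mask + dest)
 * Groups of four pixels whose mask is entirely zero are skipped.
 */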
2531 static void |
|
2532 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, |
|
2533 pixman_composite_info_t *info) |
|
2534 { |
|
2535 PIXMAN_COMPOSITE_ARGS (info); |
|
2536 uint32_t src; |
|
2537 uint32_t *dst_line, d; |
|
2538 uint32_t *mask_line, m; |
|
2539 uint32_t pack_cmp; |
|
2540 int dst_stride, mask_stride; |
|
2541 |
|
2542 __m128i xmm_src; |
|
2543 __m128i xmm_dst; |
|
2544 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
|
2545 |
|
2546 __m128i mmx_src, mmx_mask, mmx_dest; |
|
2547 |
|
2548 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
2549 |
|
2550 if (src == 0) |
|
2551 return; |
|
2552 |
|
2553 PIXMAN_IMAGE_GET_LINE ( |
|
2554 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
2555 PIXMAN_IMAGE_GET_LINE ( |
|
2556 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
|
2557 |
|
2558 xmm_src = _mm_unpacklo_epi8 ( |
|
2559 create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); |
|
2560 mmx_src = xmm_src; |
|
2561 |
|
2562 while (height--) |
|
2563 { |
|
2564 int w = width; |
|
2565 const uint32_t *pm = (uint32_t *)mask_line; |
|
2566 uint32_t *pd = (uint32_t *)dst_line; |
|
2567 |
|
2568 dst_line += dst_stride; |
|
2569 mask_line += mask_stride; |
|
2570 |
|
2571 while (w && (uintptr_t)pd & 15) |
|
2572 { |
|
2573 m = *pm++; |
|
2574 |
|
2575 if (m) |
|
2576 { |
|
2577 d = *pd; |
|
2578 |
|
2579 mmx_mask = unpack_32_1x128 (m); |
|
2580 mmx_dest = unpack_32_1x128 (d); |
|
2581 |
|
2582 *pd = pack_1x128_32 ( |
|
2583 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), |
|
2584 mmx_dest)); |
|
2585 } |
|
2586 |
|
2587 pd++; |
|
2588 w--; |
|
2589 } |
|
2590 |
|
2591 while (w >= 4) |
|
2592 { |
|
2593 xmm_mask = load_128_unaligned ((__m128i*)pm); |
|
2594 |
|
2595 pack_cmp = |
|
2596 _mm_movemask_epi8 ( |
|
2597 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); |
|
2598 |
|
2599 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */ |
|
2600 if (pack_cmp != 0xffff) |
|
2601 { |
|
2602 xmm_dst = load_128_aligned ((__m128i*)pd); |
|
2603 |
|
2604 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
|
2605 |
|
2606 pix_multiply_2x128 (&xmm_src, &xmm_src, |
|
2607 &xmm_mask_lo, &xmm_mask_hi, |
|
2608 &xmm_mask_lo, &xmm_mask_hi); |
|
2609 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi); |
|
2610 |
|
2611 save_128_aligned ( |
|
2612 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst)); |
|
2613 } |
|
2614 |
|
2615 pd += 4; |
|
2616 pm += 4; |
|
2617 w -= 4; |
|
2618 } |
|
2619 |
|
2620 while (w) |
|
2621 { |
|
2622 m = *pm++; |
|
2623 |
|
2624 if (m) |
|
2625 { |
|
2626 d = *pd; |
|
2627 |
|
2628 mmx_mask = unpack_32_1x128 (m); |
|
2629 mmx_dest = unpack_32_1x128 (d); |
|
2630 |
|
2631 *pd = pack_1x128_32 ( |
|
2632 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), |
|
2633 mmx_dest)); |
|
2634 } |
|
2635 |
|
2636 pd++; |
|
2637 w--; |
|
2638 } |
|
2639 } |
|
2640 |
|
2641 } |
|
2642 |
|
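/* OVER: solid source with an a8r8g8b8 component-alpha mask over an
 * a8r8g8b8 destination.  Groups of four pixels whose mask is entirely
 * zero are skipped.
 */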
2643 static void |
|
2644 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, |
|
2645 pixman_composite_info_t *info) |
|
2646 { |
|
2647 PIXMAN_COMPOSITE_ARGS (info); |
|
2648 uint32_t src; |
|
2649 uint32_t *dst_line, d; |
|
2650 uint32_t *mask_line, m; |
|
2651 uint32_t pack_cmp; |
|
2652 int dst_stride, mask_stride; |
|
2653 |
|
2654 __m128i xmm_src, xmm_alpha; |
|
2655 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
|
2656 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
|
2657 |
|
2658 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; |
|
2659 |
|
2660 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
2661 |
|
2662 if (src == 0) |
|
2663 return; |
|
2664 |
|
2665 PIXMAN_IMAGE_GET_LINE ( |
|
2666 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
2667 PIXMAN_IMAGE_GET_LINE ( |
|
2668 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
|
2669 |
|
2670 xmm_src = _mm_unpacklo_epi8 ( |
|
2671 create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); |
|
2672 xmm_alpha = expand_alpha_1x128 (xmm_src); |
|
2673 mmx_src = xmm_src; |
|
2674 mmx_alpha = xmm_alpha; |
|
2675 |
|
2676 while (height--) |
|
2677 { |
|
2678 int w = width; |
|
2679 const uint32_t *pm = (uint32_t *)mask_line; |
|
2680 uint32_t *pd = (uint32_t *)dst_line; |
|
2681 |
|
2682 dst_line += dst_stride; |
|
2683 mask_line += mask_stride; |
|
2684 |
|
2685 while (w && (uintptr_t)pd & 15) |
|
2686 { |
|
2687 m = *pm++; |
|
2688 |
|
2689 if (m) |
|
2690 { |
|
2691 d = *pd; |
|
2692 mmx_mask = unpack_32_1x128 (m); |
|
2693 mmx_dest = unpack_32_1x128 (d); |
|
2694 |
|
2695 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src, |
|
2696 &mmx_alpha, |
|
2697 &mmx_mask, |
|
2698 &mmx_dest)); |
|
2699 } |
|
2700 |
|
2701 pd++; |
|
2702 w--; |
|
2703 } |
|
2704 |
|
2705 while (w >= 4) |
|
2706 { |
|
2707 xmm_mask = load_128_unaligned ((__m128i*)pm); |
|
2708 |
|
2709 pack_cmp = |
|
2710 _mm_movemask_epi8 ( |
|
2711 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); |
|
2712 |
|
2713 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */ |
|
2714 if (pack_cmp != 0xffff) |
|
2715 { |
|
2716 xmm_dst = load_128_aligned ((__m128i*)pd); |
|
2717 |
|
2718 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
|
2719 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
|
2720 |
|
2721 in_over_2x128 (&xmm_src, &xmm_src, |
|
2722 &xmm_alpha, &xmm_alpha, |
|
2723 &xmm_mask_lo, &xmm_mask_hi, |
|
2724 &xmm_dst_lo, &xmm_dst_hi); |
|
2725 |
|
2726 save_128_aligned ( |
|
2727 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
2728 } |
|
2729 |
|
2730 pd += 4; |
|
2731 pm += 4; |
|
2732 w -= 4; |
|
2733 } |
|
2734 |
|
2735 while (w) |
|
2736 { |
|
2737 m = *pm++; |
|
2738 |
|
2739 if (m) |
|
2740 { |
|
2741 d = *pd; |
|
2742 mmx_mask = unpack_32_1x128 (m); |
|
2743 mmx_dest = unpack_32_1x128 (d); |
|
2744 |
|
2745 *pd = pack_1x128_32 ( |
|
2746 in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); |
|
2747 } |
|
2748 |
|
2749 pd++; |
|
2750 w--; |
|
2751 } |
|
2752 } |
|
2753 |
|
2754 } |
|
2755 |
|
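/* OVER: a8r8g8b8 source scaled by the alpha of a solid mask, over an
 * a8r8g8b8 destination.  Groups of four source pixels that are all
 * zero are skipped.
 */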
2756 static void |
|
2757 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, |
|
2758 pixman_composite_info_t *info) |
|
2759 { |
|
2760 PIXMAN_COMPOSITE_ARGS (info); |
|
2761 uint32_t *dst_line, *dst; |
|
2762 uint32_t *src_line, *src; |
|
2763 uint32_t mask; |
|
2764 int32_t w; |
|
2765 int dst_stride, src_stride; |
|
2766 |
|
2767 __m128i xmm_mask; |
|
2768 __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
|
2769 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
|
2770 __m128i xmm_alpha_lo, xmm_alpha_hi; |
|
2771 |
|
2772 PIXMAN_IMAGE_GET_LINE ( |
|
2773 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
2774 PIXMAN_IMAGE_GET_LINE ( |
|
2775 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
2776 |
|
2777 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); |
|
2778 |
|
2779 xmm_mask = create_mask_16_128 (mask >> 24); |
|
2780 |
|
2781 while (height--) |
|
2782 { |
|
2783 dst = dst_line; |
|
2784 dst_line += dst_stride; |
|
2785 src = src_line; |
|
2786 src_line += src_stride; |
|
2787 w = width; |
|
2788 |
|
2789 while (w && (uintptr_t)dst & 15) |
|
2790 { |
|
2791 uint32_t s = *src++; |
|
2792 |
|
2793 if (s) |
|
2794 { |
|
2795 uint32_t d = *dst; |
|
2796 |
|
2797 __m128i ms = unpack_32_1x128 (s); |
|
2798 __m128i alpha = expand_alpha_1x128 (ms); |
|
2799 __m128i dest = xmm_mask; |
|
2800 __m128i alpha_dst = unpack_32_1x128 (d); |
|
2801 |
|
2802 *dst = pack_1x128_32 ( |
|
2803 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); |
|
2804 } |
|
2805 dst++; |
|
2806 w--; |
|
2807 } |
|
2808 |
|
2809 while (w >= 4) |
|
2810 { |
|
2811 xmm_src = load_128_unaligned ((__m128i*)src); |
|
2812 |
|
2813 if (!is_zero (xmm_src)) |
|
2814 { |
|
2815 xmm_dst = load_128_aligned ((__m128i*)dst); |
|
2816 |
|
2817 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
|
2818 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
|
2819 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
|
2820 &xmm_alpha_lo, &xmm_alpha_hi); |
|
2821 |
|
2822 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, |
|
2823 &xmm_alpha_lo, &xmm_alpha_hi, |
|
2824 &xmm_mask, &xmm_mask, |
|
2825 &xmm_dst_lo, &xmm_dst_hi); |
|
2826 |
|
2827 save_128_aligned ( |
|
2828 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
2829 } |
|
2830 |
|
2831 dst += 4; |
|
2832 src += 4; |
|
2833 w -= 4; |
|
2834 } |
|
2835 |
|
2836 while (w) |
|
2837 { |
|
2838 uint32_t s = *src++; |
|
2839 |
|
2840 if (s) |
|
2841 { |
|
2842 uint32_t d = *dst; |
|
2843 |
|
2844 __m128i ms = unpack_32_1x128 (s); |
|
2845 __m128i alpha = expand_alpha_1x128 (ms); |
|
2846 __m128i mask = xmm_mask; |
|
2847 __m128i dest = unpack_32_1x128 (d); |
|
2848 |
|
2849 *dst = pack_1x128_32 ( |
|
2850 in_over_1x128 (&ms, &alpha, &mask, &dest)); |
|
2851 } |
|
2852 |
|
2853 dst++; |
|
2854 w--; |
|
2855 } |
|
2856 } |
|
2857 |
|
2858 } |
|
2859 |
|
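/* SRC: convert x8r8g8b8 to r5g6b5, eight pixels per vector iteration. */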
2860 static void |
|
2861 sse2_composite_src_x888_0565 (pixman_implementation_t *imp, |
|
2862 pixman_composite_info_t *info) |
|
2863 { |
|
2864 PIXMAN_COMPOSITE_ARGS (info); |
|
2865 uint16_t *dst_line, *dst; |
|
2866 uint32_t *src_line, *src, s; |
|
2867 int dst_stride, src_stride; |
|
2868 int32_t w; |
|
2869 |
|
2870 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
2871 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
|
2872 |
|
2873 while (height--) |
|
2874 { |
|
2875 dst = dst_line; |
|
2876 dst_line += dst_stride; |
|
2877 src = src_line; |
|
2878 src_line += src_stride; |
|
2879 w = width; |
|
2880 |
|
2881 while (w && (uintptr_t)dst & 15) |
|
2882 { |
|
2883 s = *src++; |
|
2884 *dst = convert_8888_to_0565 (s); |
|
2885 dst++; |
|
2886 w--; |
|
2887 } |
|
2888 |
|
2889 while (w >= 8) |
|
2890 { |
|
2891 __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0); |
|
2892 __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1); |
|
2893 |
|
2894 save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1)); |
|
2895 |
|
2896 w -= 8; |
|
2897 src += 8; |
|
2898 dst += 8; |
|
2899 } |
|
2900 |
|
2901 while (w) |
|
2902 { |
|
2903 s = *src++; |
|
2904 *dst = convert_8888_to_0565 (s); |
|
2905 dst++; |
|
2906 w--; |
|
2907 } |
|
2908 } |
|
2909 } |
|
2910 |
|
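/* SRC: copy x8r8g8b8 to a8r8g8b8, forcing the alpha byte to 0xff;
 * the vector loop handles sixteen pixels per iteration.
 */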
2911 static void |
|
2912 sse2_composite_src_x888_8888 (pixman_implementation_t *imp, |
|
2913 pixman_composite_info_t *info) |
|
2914 { |
|
2915 PIXMAN_COMPOSITE_ARGS (info); |
|
2916 uint32_t *dst_line, *dst; |
|
2917 uint32_t *src_line, *src; |
|
2918 int32_t w; |
|
2919 int dst_stride, src_stride; |
|
2920 |
|
2921 |
|
2922 PIXMAN_IMAGE_GET_LINE ( |
|
2923 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
2924 PIXMAN_IMAGE_GET_LINE ( |
|
2925 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
2926 |
|
2927 while (height--) |
|
2928 { |
|
2929 dst = dst_line; |
|
2930 dst_line += dst_stride; |
|
2931 src = src_line; |
|
2932 src_line += src_stride; |
|
2933 w = width; |
|
2934 |
|
2935 while (w && (uintptr_t)dst & 15) |
|
2936 { |
|
2937 *dst++ = *src++ | 0xff000000; |
|
2938 w--; |
|
2939 } |
|
2940 |
|
2941 while (w >= 16) |
|
2942 { |
|
2943 __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4; |
|
2944 |
|
2945 xmm_src1 = load_128_unaligned ((__m128i*)src + 0); |
|
2946 xmm_src2 = load_128_unaligned ((__m128i*)src + 1); |
|
2947 xmm_src3 = load_128_unaligned ((__m128i*)src + 2); |
|
2948 xmm_src4 = load_128_unaligned ((__m128i*)src + 3); |
|
2949 |
|
2950 save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000)); |
|
2951 save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000)); |
|
2952 save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000)); |
|
2953 save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000)); |
|
2954 |
|
2955 dst += 16; |
|
2956 src += 16; |
|
2957 w -= 16; |
|
2958 } |
|
2959 |
|
2960 while (w) |
|
2961 { |
|
2962 *dst++ = *src++ | 0xff000000; |
|
2963 w--; |
|
2964 } |
|
2965 } |
|
2966 |
|
2967 } |
|
2968 |
|
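/* OVER: x8r8g8b8 source (treated as opaque) scaled by the alpha of a
 * solid mask, over an a8r8g8b8 destination.
 */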
2969 static void |
|
2970 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, |
|
2971 pixman_composite_info_t *info) |
|
2972 { |
|
2973 PIXMAN_COMPOSITE_ARGS (info); |
|
2974 uint32_t *dst_line, *dst; |
|
2975 uint32_t *src_line, *src; |
|
2976 uint32_t mask; |
|
2977 int dst_stride, src_stride; |
|
2978 int32_t w; |
|
2979 |
|
2980 __m128i xmm_mask, xmm_alpha; |
|
2981 __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
|
2982 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
|
2983 |
|
2984 PIXMAN_IMAGE_GET_LINE ( |
|
2985 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
2986 PIXMAN_IMAGE_GET_LINE ( |
|
2987 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
2988 |
|
2989 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); |
|
2990 |
|
2991 xmm_mask = create_mask_16_128 (mask >> 24); |
|
2992 xmm_alpha = mask_00ff; |
|
2993 |
|
2994 while (height--) |
|
2995 { |
|
2996 dst = dst_line; |
|
2997 dst_line += dst_stride; |
|
2998 src = src_line; |
|
2999 src_line += src_stride; |
|
3000 w = width; |
|
3001 |
|
3002 while (w && (uintptr_t)dst & 15) |
|
3003 { |
|
3004 uint32_t s = (*src++) | 0xff000000; |
|
3005 uint32_t d = *dst; |
|
3006 |
|
3007 __m128i src = unpack_32_1x128 (s); |
|
3008 __m128i alpha = xmm_alpha; |
|
3009 __m128i mask = xmm_mask; |
|
3010 __m128i dest = unpack_32_1x128 (d); |
|
3011 |
|
3012 *dst++ = pack_1x128_32 ( |
|
3013 in_over_1x128 (&src, &alpha, &mask, &dest)); |
|
3014 |
|
3015 w--; |
|
3016 } |
|
3017 |
|
3018 while (w >= 4) |
|
3019 { |
|
3020 xmm_src = _mm_or_si128 ( |
|
3021 load_128_unaligned ((__m128i*)src), mask_ff000000); |
|
3022 xmm_dst = load_128_aligned ((__m128i*)dst); |
|
3023 |
|
3024 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
|
3025 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
|
3026 |
|
3027 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, |
|
3028 &xmm_alpha, &xmm_alpha, |
|
3029 &xmm_mask, &xmm_mask, |
|
3030 &xmm_dst_lo, &xmm_dst_hi); |
|
3031 |
|
3032 save_128_aligned ( |
|
3033 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
3034 |
|
3035 dst += 4; |
|
3036 src += 4; |
|
3037 w -= 4; |
|
3038 |
|
3039 } |
|
3040 |
|
3041 while (w) |
|
3042 { |
|
3043 uint32_t s = (*src++) | 0xff000000; |
|
3044 uint32_t d = *dst; |
|
3045 |
|
3046 __m128i src = unpack_32_1x128 (s); |
|
3047 __m128i alpha = xmm_alpha; |
|
3048 __m128i mask = xmm_mask; |
|
3049 __m128i dest = unpack_32_1x128 (d); |
|
3050 |
|
3051 *dst++ = pack_1x128_32 ( |
|
3052 in_over_1x128 (&src, &alpha, &mask, &dest)); |
|
3053 |
|
3054 w--; |
|
3055 } |
|
3056 } |
|
3057 |
|
3058 } |
|
3059 |
|
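/* OVER: a8r8g8b8 over a8r8g8b8, delegating each scanline to the
 * unified-alpha combiner sse2_combine_over_u.
 */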
3060 static void |
|
3061 sse2_composite_over_8888_8888 (pixman_implementation_t *imp, |
|
3062 pixman_composite_info_t *info) |
|
3063 { |
|
3064 PIXMAN_COMPOSITE_ARGS (info); |
|
3065 int dst_stride, src_stride; |
|
3066 uint32_t *dst_line, *dst; |
|
3067 uint32_t *src_line, *src; |
|
3068 |
|
3069 PIXMAN_IMAGE_GET_LINE ( |
|
3070 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
3071 PIXMAN_IMAGE_GET_LINE ( |
|
3072 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
3073 |
|
3074 dst = dst_line; |
|
3075 src = src_line; |
|
3076 |
|
3077 while (height--) |
|
3078 { |
|
3079 sse2_combine_over_u (imp, op, dst, src, NULL, width); |
|
3080 |
|
3081 dst += dst_stride; |
|
3082 src += src_stride; |
|
3083 } |
|
3084 } |
|
3085 |
|
3086 static force_inline uint16_t |
|
3087 composite_over_8888_0565pixel (uint32_t src, uint16_t dst) |
|
3088 { |
|
3089 __m128i ms; |
|
3090 |
|
3091 ms = unpack_32_1x128 (src); |
|
3092 return pack_565_32_16 ( |
|
3093 pack_1x128_32 ( |
|
3094 over_1x128 ( |
|
3095 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst)))); |
|
3096 } |
|
3097 |
|
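/* OVER: a8r8g8b8 source over an r5g6b5 destination, eight pixels per
 * vector iteration.
 */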
3098 static void |
|
3099 sse2_composite_over_8888_0565 (pixman_implementation_t *imp, |
|
3100 pixman_composite_info_t *info) |
|
3101 { |
|
3102 PIXMAN_COMPOSITE_ARGS (info); |
|
3103 uint16_t *dst_line, *dst, d; |
|
3104 uint32_t *src_line, *src, s; |
|
3105 int dst_stride, src_stride; |
|
3106 int32_t w; |
|
3107 |
|
3108 __m128i xmm_alpha_lo, xmm_alpha_hi; |
|
3109 __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
|
3110 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; |
|
3111 |
|
3112 PIXMAN_IMAGE_GET_LINE ( |
|
3113 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
|
3114 PIXMAN_IMAGE_GET_LINE ( |
|
3115 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
3116 |
|
3117 while (height--) |
|
3118 { |
|
3119 dst = dst_line; |
|
3120 src = src_line; |
|
3121 |
|
3122 dst_line += dst_stride; |
|
3123 src_line += src_stride; |
|
3124 w = width; |
|
3125 |
|
3126 /* Align dst on a 16-byte boundary */ |
|
3127 while (w && |
|
3128 ((uintptr_t)dst & 15)) |
|
3129 { |
|
3130 s = *src++; |
|
3131 d = *dst; |
|
3132 |
|
3133 *dst++ = composite_over_8888_0565pixel (s, d); |
|
3134 w--; |
|
3135 } |
|
3136 |
|
3137 /* Process 8 pixels per iteration */ |
|
3138 while (w >= 8) |
|
3139 { |
|
3140 /* Load the source unaligned because its address |
|
3141 * is not guaranteed to be 16-byte aligned. |
|
3142 */ |
|
3143 xmm_src = load_128_unaligned ((__m128i*) src); |
|
3144 xmm_dst = load_128_aligned ((__m128i*) dst); |
|
3145 |
|
3146 /* Unpacking */ |
|
3147 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
|
3148 unpack_565_128_4x128 (xmm_dst, |
|
3149 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); |
|
3150 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
|
3151 &xmm_alpha_lo, &xmm_alpha_hi); |
|
3152 |
|
3153 /* Load the next 4 source pixels early so that |
|
3154 * the memory read overlaps with the blend below. |
|
3155 */ |
|
3156 xmm_src = load_128_unaligned ((__m128i*) (src + 4)); |
|
3157 |
|
3158 over_2x128 (&xmm_src_lo, &xmm_src_hi, |
|
3159 &xmm_alpha_lo, &xmm_alpha_hi, |
|
3160 &xmm_dst0, &xmm_dst1); |
|
3161 |
|
3162 /* Unpacking */ |
|
3163 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
|
3164 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
|
3165 &xmm_alpha_lo, &xmm_alpha_hi); |
|
3166 |
|
3167 over_2x128 (&xmm_src_lo, &xmm_src_hi, |
|
3168 &xmm_alpha_lo, &xmm_alpha_hi, |
|
3169 &xmm_dst2, &xmm_dst3); |
|
3170 |
|
3171 save_128_aligned ( |
|
3172 (__m128i*)dst, pack_565_4x128_128 ( |
|
3173 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); |
|
3174 |
|
3175 w -= 8; |
|
3176 dst += 8; |
|
3177 src += 8; |
|
3178 } |
|
3179 |
|
3180 while (w--) |
|
3181 { |
|
3182 s = *src++; |
|
3183 d = *dst; |
|
3184 |
|
3185 *dst++ = composite_over_8888_0565pixel (s, d); |
|
3186 } |
|
3187 } |
|
3188 |
|
3189 } |
|
3190 |
|
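/* OVER: solid source with an a8 mask over an a8r8g8b8 destination.
 * When the source is opaque and four mask bytes are all 0xff, the
 * solid color is stored directly; all-zero mask groups are skipped.
 */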
3191 static void |
|
3192 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, |
|
3193 pixman_composite_info_t *info) |
|
3194 { |
|
3195 PIXMAN_COMPOSITE_ARGS (info); |
|
3196 uint32_t src, srca; |
|
3197 uint32_t *dst_line, *dst; |
|
3198 uint8_t *mask_line, *mask; |
|
3199 int dst_stride, mask_stride; |
|
3200 int32_t w; |
|
3201 uint32_t m, d; |
|
3202 |
|
3203 __m128i xmm_src, xmm_alpha, xmm_def; |
|
3204 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
|
3205 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
|
3206 |
|
3207 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; |
|
3208 |
|
3209 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
3210 |
|
3211 srca = src >> 24; |
|
3212 if (src == 0) |
|
3213 return; |
|
3214 |
|
3215 PIXMAN_IMAGE_GET_LINE ( |
|
3216 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
3217 PIXMAN_IMAGE_GET_LINE ( |
|
3218 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
|
3219 |
|
3220 xmm_def = create_mask_2x32_128 (src, src); |
|
3221 xmm_src = expand_pixel_32_1x128 (src); |
|
3222 xmm_alpha = expand_alpha_1x128 (xmm_src); |
|
3223 mmx_src = xmm_src; |
|
3224 mmx_alpha = xmm_alpha; |
|
3225 |
|
3226 while (height--) |
|
3227 { |
|
3228 dst = dst_line; |
|
3229 dst_line += dst_stride; |
|
3230 mask = mask_line; |
|
3231 mask_line += mask_stride; |
|
3232 w = width; |
|
3233 |
|
3234 while (w && (uintptr_t)dst & 15) |
|
3235 { |
|
3236 uint8_t m = *mask++; |
|
3237 |
|
3238 if (m) |
|
3239 { |
|
3240 d = *dst; |
|
3241 mmx_mask = expand_pixel_8_1x128 (m); |
|
3242 mmx_dest = unpack_32_1x128 (d); |
|
3243 |
|
3244 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, |
|
3245 &mmx_alpha, |
|
3246 &mmx_mask, |
|
3247 &mmx_dest)); |
|
3248 } |
|
3249 |
|
3250 w--; |
|
3251 dst++; |
|
3252 } |
|
3253 |
|
3254 while (w >= 4) |
|
3255 { |
|
3256 m = *((uint32_t*)mask); |
|
3257 |
|
3258 if (srca == 0xff && m == 0xffffffff) |
|
3259 { |
|
3260 save_128_aligned ((__m128i*)dst, xmm_def); |
|
3261 } |
|
3262 else if (m) |
|
3263 { |
|
3264 xmm_dst = load_128_aligned ((__m128i*) dst); |
|
3265 xmm_mask = unpack_32_1x128 (m); |
|
3266 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); |
|
3267 |
|
3268 /* Unpacking */ |
|
3269 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
|
3270 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
|
3271 |
|
3272 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, |
|
3273 &xmm_mask_lo, &xmm_mask_hi); |
|
3274 |
|
3275 in_over_2x128 (&xmm_src, &xmm_src, |
|
3276 &xmm_alpha, &xmm_alpha, |
|
3277 &xmm_mask_lo, &xmm_mask_hi, |
|
3278 &xmm_dst_lo, &xmm_dst_hi); |
|
3279 |
|
3280 save_128_aligned ( |
|
3281 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
3282 } |
|
3283 |
|
3284 w -= 4; |
|
3285 dst += 4; |
|
3286 mask += 4; |
|
3287 } |
|
3288 |
|
3289 while (w) |
|
3290 { |
|
3291 uint8_t m = *mask++; |
|
3292 |
|
3293 if (m) |
|
3294 { |
|
3295 d = *dst; |
|
3296 mmx_mask = expand_pixel_8_1x128 (m); |
|
3297 mmx_dest = unpack_32_1x128 (d); |
|
3298 |
|
3299 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, |
|
3300 &mmx_alpha, |
|
3301 &mmx_mask, |
|
3302 &mmx_dest)); |
|
3303 } |
|
3304 |
|
3305 w--; |
|
3306 dst++; |
|
3307 } |
|
3308 } |
|
3309 |
|
3310 } |
|
3311 |
|
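/* Fill a rectangle with a constant value at 8, 16 or 32 bpp.  The
 * filler is replicated into one XMM register and written with aligned
 * 16-byte stores, unrolled up to 128 bytes per iteration; other bit
 * depths return FALSE so a generic path can handle them.
 */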
3312 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) |
|
3313 __attribute__((__force_align_arg_pointer__)) |
|
3314 #endif |
|
3315 static pixman_bool_t |
|
3316 sse2_fill (pixman_implementation_t *imp, |
|
3317 uint32_t * bits, |
|
3318 int stride, |
|
3319 int bpp, |
|
3320 int x, |
|
3321 int y, |
|
3322 int width, |
|
3323 int height, |
|
3324 uint32_t filler) |
|
3325 { |
|
3326 uint32_t byte_width; |
|
3327 uint8_t *byte_line; |
|
3328 |
|
3329 __m128i xmm_def; |
|
3330 |
|
3331 if (bpp == 8) |
|
3332 { |
|
3333 uint8_t b; |
|
3334 uint16_t w; |
|
3335 |
|
3336 stride = stride * (int) sizeof (uint32_t) / 1; |
|
3337 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); |
|
3338 byte_width = width; |
|
3339 stride *= 1; |
|
3340 |
|
3341 b = filler & 0xff; |
|
3342 w = (b << 8) | b; |
|
3343 filler = (w << 16) | w; |
|
3344 } |
|
3345 else if (bpp == 16) |
|
3346 { |
|
3347 stride = stride * (int) sizeof (uint32_t) / 2; |
|
3348 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); |
|
3349 byte_width = 2 * width; |
|
3350 stride *= 2; |
|
3351 |
|
3352 filler = (filler & 0xffff) * 0x00010001; |
|
3353 } |
|
3354 else if (bpp == 32) |
|
3355 { |
|
3356 stride = stride * (int) sizeof (uint32_t) / 4; |
|
3357 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); |
|
3358 byte_width = 4 * width; |
|
3359 stride *= 4; |
|
3360 } |
|
3361 else |
|
3362 { |
|
3363 return FALSE; |
|
3364 } |
|
3365 |
|
3366 xmm_def = create_mask_2x32_128 (filler, filler); |
|
3367 |
|
3368 while (height--) |
|
3369 { |
|
3370 int w; |
|
3371 uint8_t *d = byte_line; |
|
3372 byte_line += stride; |
|
3373 w = byte_width; |
|
3374 |
|
3375 if (w >= 1 && ((uintptr_t)d & 1)) |
|
3376 { |
|
3377 *(uint8_t *)d = filler; |
|
3378 w -= 1; |
|
3379 d += 1; |
|
3380 } |
|
3381 |
|
3382 while (w >= 2 && ((uintptr_t)d & 3)) |
|
3383 { |
|
3384 *(uint16_t *)d = filler; |
|
3385 w -= 2; |
|
3386 d += 2; |
|
3387 } |
|
3388 |
|
3389 while (w >= 4 && ((uintptr_t)d & 15)) |
|
3390 { |
|
3391 *(uint32_t *)d = filler; |
|
3392 |
|
3393 w -= 4; |
|
3394 d += 4; |
|
3395 } |
|
3396 |
|
3397 while (w >= 128) |
|
3398 { |
|
3399 save_128_aligned ((__m128i*)(d), xmm_def); |
|
3400 save_128_aligned ((__m128i*)(d + 16), xmm_def); |
|
3401 save_128_aligned ((__m128i*)(d + 32), xmm_def); |
|
3402 save_128_aligned ((__m128i*)(d + 48), xmm_def); |
|
3403 save_128_aligned ((__m128i*)(d + 64), xmm_def); |
|
3404 save_128_aligned ((__m128i*)(d + 80), xmm_def); |
|
3405 save_128_aligned ((__m128i*)(d + 96), xmm_def); |
|
3406 save_128_aligned ((__m128i*)(d + 112), xmm_def); |
|
3407 |
|
3408 d += 128; |
|
3409 w -= 128; |
|
3410 } |
|
3411 |
|
3412 if (w >= 64) |
|
3413 { |
|
3414 save_128_aligned ((__m128i*)(d), xmm_def); |
|
3415 save_128_aligned ((__m128i*)(d + 16), xmm_def); |
|
3416 save_128_aligned ((__m128i*)(d + 32), xmm_def); |
|
3417 save_128_aligned ((__m128i*)(d + 48), xmm_def); |
|
3418 |
|
3419 d += 64; |
|
3420 w -= 64; |
|
3421 } |
|
3422 |
|
3423 if (w >= 32) |
|
3424 { |
|
3425 save_128_aligned ((__m128i*)(d), xmm_def); |
|
3426 save_128_aligned ((__m128i*)(d + 16), xmm_def); |
|
3427 |
|
3428 d += 32; |
|
3429 w -= 32; |
|
3430 } |
|
3431 |
|
3432 if (w >= 16) |
|
3433 { |
|
3434 save_128_aligned ((__m128i*)(d), xmm_def); |
|
3435 |
|
3436 d += 16; |
|
3437 w -= 16; |
|
3438 } |
|
3439 |
|
3440 while (w >= 4) |
|
3441 { |
|
3442 *(uint32_t *)d = filler; |
|
3443 |
|
3444 w -= 4; |
|
3445 d += 4; |
|
3446 } |
|
3447 |
|
3448 if (w >= 2) |
|
3449 { |
|
3450 *(uint16_t *)d = filler; |
|
3451 w -= 2; |
|
3452 d += 2; |
|
3453 } |
|
3454 |
|
3455 if (w >= 1) |
|
3456 { |
|
3457 *(uint8_t *)d = filler; |
|
3458 w -= 1; |
|
3459 d += 1; |
|
3460 } |
|
3461 } |
|
3462 |
|
3463 return TRUE; |
|
3464 } |
|
3465 |
|
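/* SRC: solid source with an a8 mask into an a8r8g8b8 destination:
 *     dest = src * mask
 * A zero source degenerates to a solid fill of zeros.
 */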
3466 static void |
|
3467 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, |
|
3468 pixman_composite_info_t *info) |
|
3469 { |
|
3470 PIXMAN_COMPOSITE_ARGS (info); |
|
3471 uint32_t src, srca; |
|
3472 uint32_t *dst_line, *dst; |
|
3473 uint8_t *mask_line, *mask; |
|
3474 int dst_stride, mask_stride; |
|
3475 int32_t w; |
|
3476 uint32_t m; |
|
3477 |
|
3478 __m128i xmm_src, xmm_def; |
|
3479 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
|
3480 |
|
3481 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
3482 |
|
3483 srca = src >> 24; |
|
3484 if (src == 0) |
|
3485 { |
|
3486 sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride, |
|
3487 PIXMAN_FORMAT_BPP (dest_image->bits.format), |
|
3488 dest_x, dest_y, width, height, 0); |
|
3489 return; |
|
3490 } |
|
3491 |
|
3492 PIXMAN_IMAGE_GET_LINE ( |
|
3493 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
3494 PIXMAN_IMAGE_GET_LINE ( |
|
3495 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
|
3496 |
|
3497 xmm_def = create_mask_2x32_128 (src, src); |
|
3498 xmm_src = expand_pixel_32_1x128 (src); |
|
3499 |
|
3500 while (height--) |
|
3501 { |
|
3502 dst = dst_line; |
|
3503 dst_line += dst_stride; |
|
3504 mask = mask_line; |
|
3505 mask_line += mask_stride; |
|
3506 w = width; |
|
3507 |
|
3508 while (w && (uintptr_t)dst & 15) |
|
3509 { |
|
3510 uint8_t m = *mask++; |
|
3511 |
|
3512 if (m) |
|
3513 { |
|
3514 *dst = pack_1x128_32 ( |
|
3515 pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m))); |
|
3516 } |
|
3517 else |
|
3518 { |
|
3519 *dst = 0; |
|
3520 } |
|
3521 |
|
3522 w--; |
|
3523 dst++; |
|
3524 } |
|
3525 |
|
3526 while (w >= 4) |
|
3527 { |
|
3528 m = *((uint32_t*)mask); |
|
3529 |
|
3530 if (srca == 0xff && m == 0xffffffff) |
|
3531 { |
|
3532 save_128_aligned ((__m128i*)dst, xmm_def); |
|
3533 } |
|
3534 else if (m) |
|
3535 { |
|
3536 xmm_mask = unpack_32_1x128 (m); |
|
3537 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); |
|
3538 |
|
3539 /* Unpacking */ |
|
3540 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
|
3541 |
|
3542 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, |
|
3543 &xmm_mask_lo, &xmm_mask_hi); |
|
3544 |
|
3545 pix_multiply_2x128 (&xmm_src, &xmm_src, |
|
3546 &xmm_mask_lo, &xmm_mask_hi, |
|
3547 &xmm_mask_lo, &xmm_mask_hi); |
|
3548 |
|
3549 save_128_aligned ( |
|
3550 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); |
|
3551 } |
|
3552 else |
|
3553 { |
|
3554 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ()); |
|
3555 } |
|
3556 |
|
3557 w -= 4; |
|
3558 dst += 4; |
|
3559 mask += 4; |
|
3560 } |
|
3561 |
|
3562 while (w) |
|
3563 { |
|
3564 uint8_t m = *mask++; |
|
3565 |
|
3566 if (m) |
|
3567 { |
|
3568 *dst = pack_1x128_32 ( |
|
3569 pix_multiply_1x128 ( |
|
3570 xmm_src, expand_pixel_8_1x128 (m))); |
|
3571 } |
|
3572 else |
|
3573 { |
|
3574 *dst = 0; |
|
3575 } |
|
3576 |
|
3577 w--; |
|
3578 dst++; |
|
3579 } |
|
3580 } |
|
3581 |
|
3582 } |
|
3583 |
|
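/* OVER: solid source with an a8 mask over an r5g6b5 destination. */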
3584 static void |
|
3585 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, |
|
3586 pixman_composite_info_t *info) |
|
3587 { |
|
3588 PIXMAN_COMPOSITE_ARGS (info); |
|
3589 uint32_t src; |
|
3590 uint16_t *dst_line, *dst, d; |
|
3591 uint8_t *mask_line, *mask; |
|
3592 int dst_stride, mask_stride; |
|
3593 int32_t w; |
|
3594 uint32_t m; |
|
3595 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; |
|
3596 |
|
3597 __m128i xmm_src, xmm_alpha; |
|
3598 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
|
3599 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; |
|
3600 |
|
3601 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
3602 |
|
3603 if (src == 0) |
|
3604 return; |
|
3605 |
|
3606 PIXMAN_IMAGE_GET_LINE ( |
|
3607 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
|
3608 PIXMAN_IMAGE_GET_LINE ( |
|
3609 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
|
3610 |
|
3611 xmm_src = expand_pixel_32_1x128 (src); |
|
3612 xmm_alpha = expand_alpha_1x128 (xmm_src); |
|
3613 mmx_src = xmm_src; |
|
3614 mmx_alpha = xmm_alpha; |
|
3615 |
|
3616 while (height--) |
|
3617 { |
|
3618 dst = dst_line; |
|
3619 dst_line += dst_stride; |
|
3620 mask = mask_line; |
|
3621 mask_line += mask_stride; |
|
3622 w = width; |
|
3623 |
|
3624 while (w && (uintptr_t)dst & 15) |
|
3625 { |
|
3626 m = *mask++; |
|
3627 |
|
3628 if (m) |
|
3629 { |
|
3630 d = *dst; |
|
3631 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); |
|
3632 mmx_dest = expand565_16_1x128 (d); |
|
3633 |
|
3634 *dst = pack_565_32_16 ( |
|
3635 pack_1x128_32 ( |
|
3636 in_over_1x128 ( |
|
3637 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); |
|
3638 } |
|
3639 |
|
3640 w--; |
|
3641 dst++; |
|
3642 } |
|
3643 |
|
3644 while (w >= 8) |
|
3645 { |
|
3646 xmm_dst = load_128_aligned ((__m128i*) dst); |
|
3647 unpack_565_128_4x128 (xmm_dst, |
|
3648 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); |
|
3649 |
|
3650 m = *((uint32_t*)mask); |
|
3651 mask += 4; |
|
3652 |
|
3653 if (m) |
|
3654 { |
|
3655 xmm_mask = unpack_32_1x128 (m); |
|
3656 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); |
|
3657 |
|
3658 /* Unpacking */ |
|
3659 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
|
3660 |
|
3661 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, |
|
3662 &xmm_mask_lo, &xmm_mask_hi); |
|
3663 |
|
3664 in_over_2x128 (&xmm_src, &xmm_src, |
|
3665 &xmm_alpha, &xmm_alpha, |
|
3666 &xmm_mask_lo, &xmm_mask_hi, |
|
3667 &xmm_dst0, &xmm_dst1); |
|
3668 } |
|
3669 |
|
3670 m = *((uint32_t*)mask); |
|
3671 mask += 4; |
|
3672 |
|
3673 if (m) |
|
3674 { |
|
3675 xmm_mask = unpack_32_1x128 (m); |
|
3676 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); |
|
3677 |
|
3678 /* Unpacking */ |
|
3679 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
|
3680 |
|
3681 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, |
|
3682 &xmm_mask_lo, &xmm_mask_hi); |
|
3683 in_over_2x128 (&xmm_src, &xmm_src, |
|
3684 &xmm_alpha, &xmm_alpha, |
|
3685 &xmm_mask_lo, &xmm_mask_hi, |
|
3686 &xmm_dst2, &xmm_dst3); |
|
3687 } |
|
3688 |
|
3689 save_128_aligned ( |
|
3690 (__m128i*)dst, pack_565_4x128_128 ( |
|
3691 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); |
|
3692 |
|
3693 w -= 8; |
|
3694 dst += 8; |
|
3695 } |
|
3696 |
|
3697 while (w) |
|
3698 { |
|
3699 m = *mask++; |
|
3700 |
|
3701 if (m) |
|
3702 { |
|
3703 d = *dst; |
|
3704 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); |
|
3705 mmx_dest = expand565_16_1x128 (d); |
|
3706 |
|
3707 *dst = pack_565_32_16 ( |
|
3708 pack_1x128_32 ( |
|
3709 in_over_1x128 ( |
|
3710 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); |
|
3711 } |
|
3712 |
|
3713 w--; |
|
3714 dst++; |
|
3715 } |
|
3716 } |
|
3717 |
|
3718 } |
|
3719 |
|
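/* OVER for the pixbuf path onto r5g6b5: the non-premultiplied source
 * has its channels reordered and premultiplied before the blend;
 * groups of four opaque source pixels are only converted, and groups
 * that are all zero are skipped.
 */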
3720 static void |
|
3721 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, |
|
3722 pixman_composite_info_t *info) |
|
3723 { |
|
3724 PIXMAN_COMPOSITE_ARGS (info); |
|
3725 uint16_t *dst_line, *dst, d; |
|
3726 uint32_t *src_line, *src, s; |
|
3727 int dst_stride, src_stride; |
|
3728 int32_t w; |
|
3729 uint32_t opaque, zero; |
|
3730 |
|
3731 __m128i ms; |
|
3732 __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
|
3733 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; |
|
3734 |
|
3735 PIXMAN_IMAGE_GET_LINE ( |
|
3736 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
|
3737 PIXMAN_IMAGE_GET_LINE ( |
|
3738 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
3739 |
|
3740 while (height--) |
|
3741 { |
|
3742 dst = dst_line; |
|
3743 dst_line += dst_stride; |
|
3744 src = src_line; |
|
3745 src_line += src_stride; |
|
3746 w = width; |
|
3747 |
|
3748 while (w && (uintptr_t)dst & 15) |
|
3749 { |
|
3750 s = *src++; |
|
3751 d = *dst; |
|
3752 |
|
3753 ms = unpack_32_1x128 (s); |
|
3754 |
|
3755 *dst++ = pack_565_32_16 ( |
|
3756 pack_1x128_32 ( |
|
3757 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); |
|
3758 w--; |
|
3759 } |
|
3760 |
|
3761 while (w >= 8) |
|
3762 { |
|
3763 /* First round */ |
|
3764 xmm_src = load_128_unaligned ((__m128i*)src); |
|
3765 xmm_dst = load_128_aligned ((__m128i*)dst); |
|
3766 |
|
3767 opaque = is_opaque (xmm_src); |
|
3768 zero = is_zero (xmm_src); |
|
3769 |
|
3770 unpack_565_128_4x128 (xmm_dst, |
|
3771 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); |
|
3772 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
|
3773 |
|
3774 /* preload next round */ |
|
3775 xmm_src = load_128_unaligned ((__m128i*)(src + 4)); |
|
3776 |
|
3777 if (opaque) |
|
3778 { |
|
3779 invert_colors_2x128 (xmm_src_lo, xmm_src_hi, |
|
3780 &xmm_dst0, &xmm_dst1); |
|
3781 } |
|
3782 else if (!zero) |
|
3783 { |
|
3784 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, |
|
3785 &xmm_dst0, &xmm_dst1); |
|
3786 } |
|
3787 |
|
3788 /* Second round */ |
|
3789 opaque = is_opaque (xmm_src); |
|
3790 zero = is_zero (xmm_src); |
|
3791 |
|
3792 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
|
3793 |
|
3794 if (opaque) |
|
3795 { |
|
3796 invert_colors_2x128 (xmm_src_lo, xmm_src_hi, |
|
3797 &xmm_dst2, &xmm_dst3); |
|
3798 } |
|
3799 else if (!zero) |
|
3800 { |
|
3801 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, |
|
3802 &xmm_dst2, &xmm_dst3); |
|
3803 } |
|
3804 |
|
3805 save_128_aligned ( |
|
3806 (__m128i*)dst, pack_565_4x128_128 ( |
|
3807 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); |
|
3808 |
|
3809 w -= 8; |
|
3810 src += 8; |
|
3811 dst += 8; |
|
3812 } |
|
3813 |
|
3814 while (w) |
|
3815 { |
|
3816 s = *src++; |
|
3817 d = *dst; |
|
3818 |
|
3819 ms = unpack_32_1x128 (s); |
|
3820 |
|
3821 *dst++ = pack_565_32_16 ( |
|
3822 pack_1x128_32 ( |
|
3823 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); |
|
3824 w--; |
|
3825 } |
|
3826 } |
|
3827 |
|
3828 } |
|
3829 |
|
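/* OVER for the pixbuf path onto a8r8g8b8; see the r5g6b5 variant above. */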
3830 static void |
|
3831 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, |
|
3832 pixman_composite_info_t *info) |
|
3833 { |
|
3834 PIXMAN_COMPOSITE_ARGS (info); |
|
3835 uint32_t *dst_line, *dst, d; |
|
3836 uint32_t *src_line, *src, s; |
|
3837 int dst_stride, src_stride; |
|
3838 int32_t w; |
|
3839 uint32_t opaque, zero; |
|
3840 |
|
3841 __m128i xmm_src_lo, xmm_src_hi; |
|
3842 __m128i xmm_dst_lo, xmm_dst_hi; |
|
3843 |
|
3844 PIXMAN_IMAGE_GET_LINE ( |
|
3845 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
3846 PIXMAN_IMAGE_GET_LINE ( |
|
3847 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
3848 |
|
3849 while (height--) |
|
3850 { |
|
3851 dst = dst_line; |
|
3852 dst_line += dst_stride; |
|
3853 src = src_line; |
|
3854 src_line += src_stride; |
|
3855 w = width; |
|
3856 |
|
3857 while (w && (uintptr_t)dst & 15) |
|
3858 { |
|
3859 s = *src++; |
|
3860 d = *dst; |
|
3861 |
|
3862 *dst++ = pack_1x128_32 ( |
|
3863 over_rev_non_pre_1x128 ( |
|
3864 unpack_32_1x128 (s), unpack_32_1x128 (d))); |
|
3865 |
|
3866 w--; |
|
3867 } |
|
3868 |
|
3869 while (w >= 4) |
|
3870 { |
|
3871 xmm_src_hi = load_128_unaligned ((__m128i*)src); |
|
3872 |
|
3873 opaque = is_opaque (xmm_src_hi); |
|
3874 zero = is_zero (xmm_src_hi); |
|
3875 |
|
3876 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
|
3877 |
|
3878 if (opaque) |
|
3879 { |
|
3880 invert_colors_2x128 (xmm_src_lo, xmm_src_hi, |
|
3881 &xmm_dst_lo, &xmm_dst_hi); |
|
3882 |
|
3883 save_128_aligned ( |
|
3884 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
3885 } |
|
3886 else if (!zero) |
|
3887 { |
|
3888 xmm_dst_hi = load_128_aligned ((__m128i*)dst); |
|
3889 |
|
3890 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
|
3891 |
|
3892 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, |
|
3893 &xmm_dst_lo, &xmm_dst_hi); |
|
3894 |
|
3895 save_128_aligned ( |
|
3896 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
3897 } |
|
3898 |
|
3899 w -= 4; |
|
3900 dst += 4; |
|
3901 src += 4; |
|
3902 } |
|
3903 |
|
3904 while (w) |
|
3905 { |
|
3906 s = *src++; |
|
3907 d = *dst; |
|
3908 |
|
3909 *dst++ = pack_1x128_32 ( |
|
3910 over_rev_non_pre_1x128 ( |
|
3911 unpack_32_1x128 (s), unpack_32_1x128 (d))); |
|
3912 |
|
3913 w--; |
|
3914 } |
|
3915 } |
|
3916 |
|
3917 } |
|
3918 |
|
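/* OVER: solid source with an a8r8g8b8 component-alpha mask over an
 * r5g6b5 destination, eight pixels per iteration in two groups of
 * four; groups whose mask is entirely zero are skipped.
 */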
3919 static void |
|
3920 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, |
|
3921 pixman_composite_info_t *info) |
|
3922 { |
|
3923 PIXMAN_COMPOSITE_ARGS (info); |
|
3924 uint32_t src; |
|
3925 uint16_t *dst_line, *dst, d; |
|
3926 uint32_t *mask_line, *mask, m; |
|
3927 int dst_stride, mask_stride; |
|
3928 int w; |
|
3929 uint32_t pack_cmp; |
|
3930 |
|
3931 __m128i xmm_src, xmm_alpha; |
|
3932 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
|
3933 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; |
|
3934 |
|
3935 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; |
|
3936 |
|
3937 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
3938 |
|
3939 if (src == 0) |
|
3940 return; |
|
3941 |
|
3942 PIXMAN_IMAGE_GET_LINE ( |
|
3943 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
|
3944 PIXMAN_IMAGE_GET_LINE ( |
|
3945 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
|
3946 |
|
3947 xmm_src = expand_pixel_32_1x128 (src); |
|
3948 xmm_alpha = expand_alpha_1x128 (xmm_src); |
|
3949 mmx_src = xmm_src; |
|
3950 mmx_alpha = xmm_alpha; |
|
3951 |
|
3952 while (height--) |
|
3953 { |
|
3954 w = width; |
|
3955 mask = mask_line; |
|
3956 dst = dst_line; |
|
3957 mask_line += mask_stride; |
|
3958 dst_line += dst_stride; |
|
3959 |
|
3960 while (w && ((uintptr_t)dst & 15)) |
|
3961 { |
|
3962 m = *(uint32_t *) mask; |
|
3963 |
|
3964 if (m) |
|
3965 { |
|
3966 d = *dst; |
|
3967 mmx_mask = unpack_32_1x128 (m); |
|
3968 mmx_dest = expand565_16_1x128 (d); |
|
3969 |
|
3970 *dst = pack_565_32_16 ( |
|
3971 pack_1x128_32 ( |
|
3972 in_over_1x128 ( |
|
3973 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); |
|
3974 } |
|
3975 |
|
3976 w--; |
|
3977 dst++; |
|
3978 mask++; |
|
3979 } |
|
3980 |
|
3981 while (w >= 8) |
|
3982 { |
|
3983 /* First round */ |
|
3984 xmm_mask = load_128_unaligned ((__m128i*)mask); |
|
3985 xmm_dst = load_128_aligned ((__m128i*)dst); |
|
3986 |
|
3987 pack_cmp = _mm_movemask_epi8 ( |
|
3988 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); |
|
3989 |
|
3990 unpack_565_128_4x128 (xmm_dst, |
|
3991 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); |
|
3992 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
|
3993 |
|
3994 /* preload next round */ |
|
3995 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4)); |
|
3996 |
|
3997 /* blend the first 4 pixels unless their mask is all zero */ |
|
3998 if (pack_cmp != 0xffff) |
|
3999 { |
|
4000 in_over_2x128 (&xmm_src, &xmm_src, |
|
4001 &xmm_alpha, &xmm_alpha, |
|
4002 &xmm_mask_lo, &xmm_mask_hi, |
|
4003 &xmm_dst0, &xmm_dst1); |
|
4004 } |
|
4005 |
|
4006 /* Second round */ |
|
4007 pack_cmp = _mm_movemask_epi8 ( |
|
4008 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); |
|
4009 |
|
4010 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
|
4011 |
|
4012 if (pack_cmp != 0xffff) |
|
4013 { |
|
4014 in_over_2x128 (&xmm_src, &xmm_src, |
|
4015 &xmm_alpha, &xmm_alpha, |
|
4016 &xmm_mask_lo, &xmm_mask_hi, |
|
4017 &xmm_dst2, &xmm_dst3); |
|
4018 } |
|
4019 |
|
4020 save_128_aligned ( |
|
4021 (__m128i*)dst, pack_565_4x128_128 ( |
|
4022 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); |
|
4023 |
|
4024 w -= 8; |
|
4025 dst += 8; |
|
4026 mask += 8; |
|
4027 } |
|
4028 |
|
4029 while (w) |
|
4030 { |
|
4031 m = *(uint32_t *) mask; |
|
4032 |
|
4033 if (m) |
|
4034 { |
|
4035 d = *dst; |
|
4036 mmx_mask = unpack_32_1x128 (m); |
|
4037 mmx_dest = expand565_16_1x128 (d); |
|
4038 |
|
4039 *dst = pack_565_32_16 ( |
|
4040 pack_1x128_32 ( |
|
4041 in_over_1x128 ( |
|
4042 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); |
|
4043 } |
|
4044 |
|
4045 w--; |
|
4046 dst++; |
|
4047 mask++; |
|
4048 } |
|
4049 } |
|
4050 |
|
4051 } |
|
4052 |
|
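/* IN: a8 destination scaled by the solid source alpha and the a8 mask:
 *     dest = src.alpha * mask * dest
 * The vector loop processes sixteen bytes per iteration.
 */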
4053 static void |
|
4054 sse2_composite_in_n_8_8 (pixman_implementation_t *imp, |
|
4055 pixman_composite_info_t *info) |
|
4056 { |
|
4057 PIXMAN_COMPOSITE_ARGS (info); |
|
4058 uint8_t *dst_line, *dst; |
|
4059 uint8_t *mask_line, *mask; |
|
4060 int dst_stride, mask_stride; |
|
4061 uint32_t d, m; |
|
4062 uint32_t src; |
|
4063 int32_t w; |
|
4064 |
|
4065 __m128i xmm_alpha; |
|
4066 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
|
4067 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
|
4068 |
|
4069 PIXMAN_IMAGE_GET_LINE ( |
|
4070 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
|
4071 PIXMAN_IMAGE_GET_LINE ( |
|
4072 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
|
4073 |
|
4074 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
4075 |
|
4076 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); |
|
4077 |
|
4078 while (height--) |
|
4079 { |
|
4080 dst = dst_line; |
|
4081 dst_line += dst_stride; |
|
4082 mask = mask_line; |
|
4083 mask_line += mask_stride; |
|
4084 w = width; |
|
4085 |
|
4086 while (w && ((uintptr_t)dst & 15)) |
|
4087 { |
|
4088 m = (uint32_t) *mask++; |
|
4089 d = (uint32_t) *dst; |
|
4090 |
|
4091 *dst++ = (uint8_t) pack_1x128_32 ( |
|
4092 pix_multiply_1x128 ( |
|
4093 pix_multiply_1x128 (xmm_alpha, |
|
4094 unpack_32_1x128 (m)), |
|
4095 unpack_32_1x128 (d))); |
|
4096 w--; |
|
4097 } |
|
4098 |
|
4099 while (w >= 16) |
|
4100 { |
|
4101 xmm_mask = load_128_unaligned ((__m128i*)mask); |
|
4102 xmm_dst = load_128_aligned ((__m128i*)dst); |
|
4103 |
|
4104 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
|
4105 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
|
4106 |
|
4107 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, |
|
4108 &xmm_mask_lo, &xmm_mask_hi, |
|
4109 &xmm_mask_lo, &xmm_mask_hi); |
|
4110 |
|
4111 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, |
|
4112 &xmm_dst_lo, &xmm_dst_hi, |
|
4113 &xmm_dst_lo, &xmm_dst_hi); |
|
4114 |
|
4115 save_128_aligned ( |
|
4116 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
4117 |
|
4118 mask += 16; |
|
4119 dst += 16; |
|
4120 w -= 16; |
|
4121 } |
|
4122 |
|
4123 while (w) |
|
4124 { |
|
4125 m = (uint32_t) *mask++; |
|
4126 d = (uint32_t) *dst; |
|
4127 |
|
4128 *dst++ = (uint8_t) pack_1x128_32 ( |
|
4129 pix_multiply_1x128 ( |
|
4130 pix_multiply_1x128 ( |
|
4131 xmm_alpha, unpack_32_1x128 (m)), |
|
4132 unpack_32_1x128 (d))); |
|
4133 w--; |
|
4134 } |
|
4135 } |
|
4136 |
|
4137 } |
|
4138 |
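/* IN with a solid source on an a8 destination: dest = srca * dest.
 * srca == 0xff leaves the destination untouched; srca == 0 clears it. */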
|
4139 static void |
|
4140 sse2_composite_in_n_8 (pixman_implementation_t *imp, |
|
4141 pixman_composite_info_t *info) |
|
4142 { |
|
4143 PIXMAN_COMPOSITE_ARGS (info); |
|
4144 uint8_t *dst_line, *dst; |
|
4145 int dst_stride; |
|
4146 uint32_t d; |
|
4147 uint32_t src; |
|
4148 int32_t w; |
|
4149 |
|
4150 __m128i xmm_alpha; |
|
4151 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
|
4152 |
|
4153 PIXMAN_IMAGE_GET_LINE ( |
|
4154 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
|
4155 |
|
4156 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
4157 |
|
4158 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); |
|
4159 |
|
4160 src = src >> 24; |
|
4161 |
|
4162 if (src == 0xff) |
|
4163 return; |
|
4164 |
|
4165 if (src == 0x00) |
|
4166 { |
|
4167 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, |
|
4168 8, dest_x, dest_y, width, height, src); |
|
4169 |
|
4170 return; |
|
4171 } |
|
4172 |
|
4173 while (height--) |
|
4174 { |
|
4175 dst = dst_line; |
|
4176 dst_line += dst_stride; |
|
4177 w = width; |
|
4178 |
|
4179 while (w && ((uintptr_t)dst & 15)) |
|
4180 { |
|
4181 d = (uint32_t) *dst; |
|
4182 |
|
4183 *dst++ = (uint8_t) pack_1x128_32 ( |
|
4184 pix_multiply_1x128 ( |
|
4185 xmm_alpha, |
|
4186 unpack_32_1x128 (d))); |
|
4187 w--; |
|
4188 } |
|
4189 |
|
4190 while (w >= 16) |
|
4191 { |
|
4192 xmm_dst = load_128_aligned ((__m128i*)dst); |
|
4193 |
|
4194 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
|
4195 |
|
4196 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, |
|
4197 &xmm_dst_lo, &xmm_dst_hi, |
|
4198 &xmm_dst_lo, &xmm_dst_hi); |
|
4199 |
|
4200 save_128_aligned ( |
|
4201 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
4202 |
|
4203 dst += 16; |
|
4204 w -= 16; |
|
4205 } |
|
4206 |
|
4207 while (w) |
|
4208 { |
|
4209 d = (uint32_t) *dst; |
|
4210 |
|
4211 *dst++ = (uint8_t) pack_1x128_32 ( |
|
4212 pix_multiply_1x128 ( |
|
4213 xmm_alpha, |
|
4214 unpack_32_1x128 (d))); |
|
4215 w--; |
|
4216 } |
|
4217 } |
|
4218 |
|
4219 } |
|
4220 |
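/* IN between two a8 images: dest = src * dest, 16 pixels at a time. */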
|
4221 static void |
|
4222 sse2_composite_in_8_8 (pixman_implementation_t *imp, |
|
4223 pixman_composite_info_t *info) |
|
4224 { |
|
4225 PIXMAN_COMPOSITE_ARGS (info); |
|
4226 uint8_t *dst_line, *dst; |
|
4227 uint8_t *src_line, *src; |
|
4228 int src_stride, dst_stride; |
|
4229 int32_t w; |
|
4230 uint32_t s, d; |
|
4231 |
|
4232 __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
|
4233 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
|
4234 |
|
4235 PIXMAN_IMAGE_GET_LINE ( |
|
4236 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
|
4237 PIXMAN_IMAGE_GET_LINE ( |
|
4238 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); |
|
4239 |
|
4240 while (height--) |
|
4241 { |
|
4242 dst = dst_line; |
|
4243 dst_line += dst_stride; |
|
4244 src = src_line; |
|
4245 src_line += src_stride; |
|
4246 w = width; |
|
4247 |
|
4248 while (w && ((uintptr_t)dst & 15)) |
|
4249 { |
|
4250 s = (uint32_t) *src++; |
|
4251 d = (uint32_t) *dst; |
|
4252 |
|
4253 *dst++ = (uint8_t) pack_1x128_32 ( |
|
4254 pix_multiply_1x128 ( |
|
4255 unpack_32_1x128 (s), unpack_32_1x128 (d))); |
|
4256 w--; |
|
4257 } |
|
4258 |
|
4259 while (w >= 16) |
|
4260 { |
|
4261 xmm_src = load_128_unaligned ((__m128i*)src); |
|
4262 xmm_dst = load_128_aligned ((__m128i*)dst); |
|
4263 |
|
4264 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
|
4265 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
|
4266 |
|
4267 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
|
4268 &xmm_dst_lo, &xmm_dst_hi, |
|
4269 &xmm_dst_lo, &xmm_dst_hi); |
|
4270 |
|
4271 save_128_aligned ( |
|
4272 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
4273 |
|
4274 src += 16; |
|
4275 dst += 16; |
|
4276 w -= 16; |
|
4277 } |
|
4278 |
|
4279 while (w) |
|
4280 { |
|
4281 s = (uint32_t) *src++; |
|
4282 d = (uint32_t) *dst; |
|
4283 |
|
4284 *dst++ = (uint8_t) pack_1x128_32 ( |
|
4285 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d))); |
|
4286 w--; |
|
4287 } |
|
4288 } |
|
4289 |
|
4290 } |
|
4291 |
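/* ADD with a solid source and an a8 mask on an a8 destination:
 * dest = clamp (srca * mask + dest), using saturating 16-bit adds. */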
|
4292 static void |
|
4293 sse2_composite_add_n_8_8 (pixman_implementation_t *imp, |
|
4294 pixman_composite_info_t *info) |
|
4295 { |
|
4296 PIXMAN_COMPOSITE_ARGS (info); |
|
4297 uint8_t *dst_line, *dst; |
|
4298 uint8_t *mask_line, *mask; |
|
4299 int dst_stride, mask_stride; |
|
4300 int32_t w; |
|
4301 uint32_t src; |
|
4302 uint32_t m, d; |
|
4303 |
|
4304 __m128i xmm_alpha; |
|
4305 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
|
4306 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
|
4307 |
|
4308 PIXMAN_IMAGE_GET_LINE ( |
|
4309 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
|
4310 PIXMAN_IMAGE_GET_LINE ( |
|
4311 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
|
4312 |
|
4313 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
4314 |
|
4315 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); |
|
4316 |
|
4317 while (height--) |
|
4318 { |
|
4319 dst = dst_line; |
|
4320 dst_line += dst_stride; |
|
4321 mask = mask_line; |
|
4322 mask_line += mask_stride; |
|
4323 w = width; |
|
4324 |
|
4325 while (w && ((uintptr_t)dst & 15)) |
|
4326 { |
|
4327 m = (uint32_t) *mask++; |
|
4328 d = (uint32_t) *dst; |
|
4329 |
|
4330 *dst++ = (uint8_t) pack_1x128_32 ( |
|
4331 _mm_adds_epu16 ( |
|
4332 pix_multiply_1x128 ( |
|
4333 xmm_alpha, unpack_32_1x128 (m)), |
|
4334 unpack_32_1x128 (d))); |
|
4335 w--; |
|
4336 } |
|
4337 |
|
4338 while (w >= 16) |
|
4339 { |
|
4340 xmm_mask = load_128_unaligned ((__m128i*)mask); |
|
4341 xmm_dst = load_128_aligned ((__m128i*)dst); |
|
4342 |
|
4343 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
|
4344 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
|
4345 |
|
4346 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, |
|
4347 &xmm_mask_lo, &xmm_mask_hi, |
|
4348 &xmm_mask_lo, &xmm_mask_hi); |
|
4349 |
|
4350 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); |
|
4351 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); |
|
4352 |
|
4353 save_128_aligned ( |
|
4354 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
4355 |
|
4356 mask += 16; |
|
4357 dst += 16; |
|
4358 w -= 16; |
|
4359 } |
|
4360 |
|
4361 while (w) |
|
4362 { |
|
4363 m = (uint32_t) *mask++; |
|
4364 d = (uint32_t) *dst; |
|
4365 |
|
4366 *dst++ = (uint8_t) pack_1x128_32 ( |
|
4367 _mm_adds_epu16 ( |
|
4368 pix_multiply_1x128 ( |
|
4369 xmm_alpha, unpack_32_1x128 (m)), |
|
4370 unpack_32_1x128 (d))); |
|
4371 |
|
4372 w--; |
|
4373 } |
|
4374 } |
|
4375 |
|
4376 } |
|
4377 |
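/* ADD with a solid source on an a8 destination: dest = clamp (srca + dest).
 * A zero source alpha is a no-op; 0xff degenerates into a fill. */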
|
4378 static void |
|
4379 sse2_composite_add_n_8 (pixman_implementation_t *imp, |
|
4380 pixman_composite_info_t *info) |
|
4381 { |
|
4382 PIXMAN_COMPOSITE_ARGS (info); |
|
4383 uint8_t *dst_line, *dst; |
|
4384 int dst_stride; |
|
4385 int32_t w; |
|
4386 uint32_t src; |
|
4387 |
|
4388 __m128i xmm_src; |
|
4389 |
|
4390 PIXMAN_IMAGE_GET_LINE ( |
|
4391 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
|
4392 |
|
4393 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
4394 |
|
4395 src >>= 24; |
|
4396 |
|
4397 if (src == 0x00) |
|
4398 return; |
|
4399 |
|
4400 if (src == 0xff) |
|
4401 { |
|
4402 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, |
|
4403 8, dest_x, dest_y, width, height, 0xff); |
|
4404 |
|
4405 return; |
|
4406 } |
|
4407 |
|
4408 src = (src << 24) | (src << 16) | (src << 8) | src; |
|
4409 xmm_src = _mm_set_epi32 (src, src, src, src); |
|
4410 |
|
4411 while (height--) |
|
4412 { |
|
4413 dst = dst_line; |
|
4414 dst_line += dst_stride; |
|
4415 w = width; |
|
4416 |
|
4417 while (w && ((uintptr_t)dst & 15)) |
|
4418 { |
|
4419 *dst = (uint8_t)_mm_cvtsi128_si32 ( |
|
4420 _mm_adds_epu8 ( |
|
4421 xmm_src, |
|
4422 _mm_cvtsi32_si128 (*dst))); |
|
4423 |
|
4424 w--; |
|
4425 dst++; |
|
4426 } |
|
4427 |
|
4428 while (w >= 16) |
|
4429 { |
|
4430 save_128_aligned ( |
|
4431 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); |
|
4432 |
|
4433 dst += 16; |
|
4434 w -= 16; |
|
4435 } |
|
4436 |
|
4437 while (w) |
|
4438 { |
|
4439 *dst = (uint8_t)_mm_cvtsi128_si32 ( |
|
4440 _mm_adds_epu8 ( |
|
4441 xmm_src, |
|
4442 _mm_cvtsi32_si128 (*dst))); |
|
4443 |
|
4444 w--; |
|
4445 dst++; |
|
4446 } |
|
4447 } |
|
4448 |
|
4449 } |
|
4450 |
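/* ADD between two a8 images. The aligned middle of each scanline is
 * delegated to sse2_combine_add_u four bytes at a time; the head and
 * tail pixels use scalar saturating adds. */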
|
4451 static void |
|
4452 sse2_composite_add_8_8 (pixman_implementation_t *imp, |
|
4453 pixman_composite_info_t *info) |
|
4454 { |
|
4455 PIXMAN_COMPOSITE_ARGS (info); |
|
4456 uint8_t *dst_line, *dst; |
|
4457 uint8_t *src_line, *src; |
|
4458 int dst_stride, src_stride; |
|
4459 int32_t w; |
|
4460 uint16_t t; |
|
4461 |
|
4462 PIXMAN_IMAGE_GET_LINE ( |
|
4463 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); |
|
4464 PIXMAN_IMAGE_GET_LINE ( |
|
4465 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
|
4466 |
|
4467 while (height--) |
|
4468 { |
|
4469 dst = dst_line; |
|
4470 src = src_line; |
|
4471 |
|
4472 dst_line += dst_stride; |
|
4473 src_line += src_stride; |
|
4474 w = width; |
|
4475 |
|
4476 /* Small head */ |
|
4477 while (w && (uintptr_t)dst & 3) |
|
4478 { |
|
4479 t = (*dst) + (*src++); |
|
4480 *dst++ = t | (0 - (t >> 8)); |
|
4481 w--; |
|
4482 } |
|
4483 |
|
4484 sse2_combine_add_u (imp, op, |
|
4485 (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); |
|
4486 |
|
4487 /* Small tail */ |
|
4488 dst += w & 0xfffc; |
|
4489 src += w & 0xfffc; |
|
4490 |
|
4491 w &= 3; |
|
4492 |
|
4493 while (w) |
|
4494 { |
|
4495 t = (*dst) + (*src++); |
|
4496 *dst++ = t | (0 - (t >> 8)); |
|
4497 w--; |
|
4498 } |
|
4499 } |
|
4500 |
|
4501 } |
|
4502 |
|
4503 static void |
|
4504 sse2_composite_add_8888_8888 (pixman_implementation_t *imp, |
|
4505 pixman_composite_info_t *info) |
|
4506 { |
|
4507 PIXMAN_COMPOSITE_ARGS (info); |
|
4508 uint32_t *dst_line, *dst; |
|
4509 uint32_t *src_line, *src; |
|
4510 int dst_stride, src_stride; |
|
4511 |
|
4512 PIXMAN_IMAGE_GET_LINE ( |
|
4513 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
4514 PIXMAN_IMAGE_GET_LINE ( |
|
4515 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
4516 |
|
4517 while (height--) |
|
4518 { |
|
4519 dst = dst_line; |
|
4520 dst_line += dst_stride; |
|
4521 src = src_line; |
|
4522 src_line += src_stride; |
|
4523 |
|
4524 sse2_combine_add_u (imp, op, dst, src, NULL, width); |
|
4525 } |
|
4526 } |
|
4527 |
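/* ADD with a solid source on an a8r8g8b8 destination using saturating
 * byte adds; source 0 is a no-op and source ~0 becomes a fill. */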
|
4528 static void |
|
4529 sse2_composite_add_n_8888 (pixman_implementation_t *imp, |
|
4530 pixman_composite_info_t *info) |
|
4531 { |
|
4532 PIXMAN_COMPOSITE_ARGS (info); |
|
4533 uint32_t *dst_line, *dst, src; |
|
4534 int dst_stride; |
|
4535 |
|
4536 __m128i xmm_src; |
|
4537 |
|
4538 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
4539 |
|
4540 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
4541 if (src == 0) |
|
4542 return; |
|
4543 |
|
4544 if (src == ~0) |
|
4545 { |
|
4546 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32, |
|
4547 dest_x, dest_y, width, height, ~0); |
|
4548 |
|
4549 return; |
|
4550 } |
|
4551 |
|
4552 xmm_src = _mm_set_epi32 (src, src, src, src); |
|
4553 while (height--) |
|
4554 { |
|
4555 int w = width; |
|
4556 uint32_t d; |
|
4557 |
|
4558 dst = dst_line; |
|
4559 dst_line += dst_stride; |
|
4560 |
|
4561 while (w && ((uintptr_t)dst & 15)) |
|
4562 { |
|
4563 d = *dst; |
|
4564 *dst++ = |
|
4565 _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d))); |
|
4566 w--; |
|
4567 } |
|
4568 |
|
4569 while (w >= 4) |
|
4570 { |
|
4571 save_128_aligned |
|
4572 ((__m128i*)dst, |
|
4573 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); |
|
4574 |
|
4575 dst += 4; |
|
4576 w -= 4; |
|
4577 } |
|
4578 |
|
4579 while (w--) |
|
4580 { |
|
4581 d = *dst; |
|
4582 *dst++ = |
|
4583 _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src, |
|
4584 _mm_cvtsi32_si128 (d))); |
|
4585 } |
|
4586 } |
|
4587 } |
|
4588 |
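/* ADD with a solid source and an a8 mask on an a8r8g8b8 destination:
 * dest = clamp (src * mask + dest). */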
|
4589 static void |
|
4590 sse2_composite_add_n_8_8888 (pixman_implementation_t *imp, |
|
4591 pixman_composite_info_t *info) |
|
4592 { |
|
4593 PIXMAN_COMPOSITE_ARGS (info); |
|
4594 uint32_t *dst_line, *dst; |
|
4595 uint8_t *mask_line, *mask; |
|
4596 int dst_stride, mask_stride; |
|
4597 int32_t w; |
|
4598 uint32_t src; |
|
4599 |
|
4600 __m128i xmm_src; |
|
4601 |
|
4602 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
4603 if (src == 0) |
|
4604 return; |
|
4605 xmm_src = expand_pixel_32_1x128 (src); |
|
4606 |
|
4607 PIXMAN_IMAGE_GET_LINE ( |
|
4608 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
4609 PIXMAN_IMAGE_GET_LINE ( |
|
4610 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
|
4611 |
|
4612 while (height--) |
|
4613 { |
|
4614 dst = dst_line; |
|
4615 dst_line += dst_stride; |
|
4616 mask = mask_line; |
|
4617 mask_line += mask_stride; |
|
4618 w = width; |
|
4619 |
|
4620 while (w && ((uintptr_t)dst & 15)) |
|
4621 { |
|
4622 uint8_t m = *mask++; |
|
4623 if (m) |
|
4624 { |
|
4625 *dst = pack_1x128_32 |
|
4626 (_mm_adds_epu16 |
|
4627 (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)), |
|
4628 unpack_32_1x128 (*dst))); |
|
4629 } |
|
4630 dst++; |
|
4631 w--; |
|
4632 } |
|
4633 |
|
4634 while (w >= 4) |
|
4635 { |
|
4636 uint32_t m = *(uint32_t*)mask; |
|
4637 if (m) |
|
4638 { |
|
4639 __m128i xmm_mask_lo, xmm_mask_hi; |
|
4640 __m128i xmm_dst_lo, xmm_dst_hi; |
|
4641 |
|
4642 __m128i xmm_dst = load_128_aligned ((__m128i*)dst); |
|
4643 __m128i xmm_mask = |
|
4644 _mm_unpacklo_epi8 (unpack_32_1x128(m), |
|
4645 _mm_setzero_si128 ()); |
|
4646 |
|
4647 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
|
4648 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
|
4649 |
|
4650 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, |
|
4651 &xmm_mask_lo, &xmm_mask_hi); |
|
4652 |
|
4653 pix_multiply_2x128 (&xmm_src, &xmm_src, |
|
4654 &xmm_mask_lo, &xmm_mask_hi, |
|
4655 &xmm_mask_lo, &xmm_mask_hi); |
|
4656 |
|
4657 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); |
|
4658 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); |
|
4659 |
|
4660 save_128_aligned ( |
|
4661 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
4662 } |
|
4663 |
|
4664 w -= 4; |
|
4665 dst += 4; |
|
4666 mask += 4; |
|
4667 } |
|
4668 |
|
4669 while (w) |
|
4670 { |
|
4671 uint8_t m = *mask++; |
|
4672 if (m) |
|
4673 { |
|
4674 *dst = pack_1x128_32 |
|
4675 (_mm_adds_epu16 |
|
4676 (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)), |
|
4677 unpack_32_1x128 (*dst))); |
|
4678 } |
|
4679 dst++; |
|
4680 w--; |
|
4681 } |
|
4682 } |
|
4683 } |
|
4684 |
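/* Plain copy between two images of equal depth (16 or 32 bpp only).
 * Aligns the destination, then copies 64 bytes per iteration with
 * unaligned loads and aligned stores. Returns FALSE for other depths. */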
|
4685 static pixman_bool_t |
|
4686 sse2_blt (pixman_implementation_t *imp, |
|
4687 uint32_t * src_bits, |
|
4688 uint32_t * dst_bits, |
|
4689 int src_stride, |
|
4690 int dst_stride, |
|
4691 int src_bpp, |
|
4692 int dst_bpp, |
|
4693 int src_x, |
|
4694 int src_y, |
|
4695 int dest_x, |
|
4696 int dest_y, |
|
4697 int width, |
|
4698 int height) |
|
4699 { |
|
4700 uint8_t * src_bytes; |
|
4701 uint8_t * dst_bytes; |
|
4702 int byte_width; |
|
4703 |
|
4704 if (src_bpp != dst_bpp) |
|
4705 return FALSE; |
|
4706 |
|
4707 if (src_bpp == 16) |
|
4708 { |
|
4709 src_stride = src_stride * (int) sizeof (uint32_t) / 2; |
|
4710 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; |
|
4711 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); |
|
4712 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); |
|
4713 byte_width = 2 * width; |
|
4714 src_stride *= 2; |
|
4715 dst_stride *= 2; |
|
4716 } |
|
4717 else if (src_bpp == 32) |
|
4718 { |
|
4719 src_stride = src_stride * (int) sizeof (uint32_t) / 4; |
|
4720 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; |
|
4721 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); |
|
4722 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); |
|
4723 byte_width = 4 * width; |
|
4724 src_stride *= 4; |
|
4725 dst_stride *= 4; |
|
4726 } |
|
4727 else |
|
4728 { |
|
4729 return FALSE; |
|
4730 } |
|
4731 |
|
4732 while (height--) |
|
4733 { |
|
4734 int w; |
|
4735 uint8_t *s = src_bytes; |
|
4736 uint8_t *d = dst_bytes; |
|
4737 src_bytes += src_stride; |
|
4738 dst_bytes += dst_stride; |
|
4739 w = byte_width; |
|
4740 |
|
4741 while (w >= 2 && ((uintptr_t)d & 3)) |
|
4742 { |
|
4743 *(uint16_t *)d = *(uint16_t *)s; |
|
4744 w -= 2; |
|
4745 s += 2; |
|
4746 d += 2; |
|
4747 } |
|
4748 |
|
4749 while (w >= 4 && ((uintptr_t)d & 15)) |
|
4750 { |
|
4751 *(uint32_t *)d = *(uint32_t *)s; |
|
4752 |
|
4753 w -= 4; |
|
4754 s += 4; |
|
4755 d += 4; |
|
4756 } |
|
4757 |
|
4758 while (w >= 64) |
|
4759 { |
|
4760 __m128i xmm0, xmm1, xmm2, xmm3; |
|
4761 |
|
4762 xmm0 = load_128_unaligned ((__m128i*)(s)); |
|
4763 xmm1 = load_128_unaligned ((__m128i*)(s + 16)); |
|
4764 xmm2 = load_128_unaligned ((__m128i*)(s + 32)); |
|
4765 xmm3 = load_128_unaligned ((__m128i*)(s + 48)); |
|
4766 |
|
4767 save_128_aligned ((__m128i*)(d), xmm0); |
|
4768 save_128_aligned ((__m128i*)(d + 16), xmm1); |
|
4769 save_128_aligned ((__m128i*)(d + 32), xmm2); |
|
4770 save_128_aligned ((__m128i*)(d + 48), xmm3); |
|
4771 |
|
4772 s += 64; |
|
4773 d += 64; |
|
4774 w -= 64; |
|
4775 } |
|
4776 |
|
4777 while (w >= 16) |
|
4778 { |
|
4779 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) ); |
|
4780 |
|
4781 w -= 16; |
|
4782 d += 16; |
|
4783 s += 16; |
|
4784 } |
|
4785 |
|
4786 while (w >= 4) |
|
4787 { |
|
4788 *(uint32_t *)d = *(uint32_t *)s; |
|
4789 |
|
4790 w -= 4; |
|
4791 s += 4; |
|
4792 d += 4; |
|
4793 } |
|
4794 |
|
4795 if (w >= 2) |
|
4796 { |
|
4797 *(uint16_t *)d = *(uint16_t *)s; |
|
4798 w -= 2; |
|
4799 s += 2; |
|
4800 d += 2; |
|
4801 } |
|
4802 } |
|
4803 |
|
4804 return TRUE; |
|
4805 } |
|
4806 |
|
4807 static void |
|
4808 sse2_composite_copy_area (pixman_implementation_t *imp, |
|
4809 pixman_composite_info_t *info) |
|
4810 { |
|
4811 PIXMAN_COMPOSITE_ARGS (info); |
|
4812 sse2_blt (imp, src_image->bits.bits, |
|
4813 dest_image->bits.bits, |
|
4814 src_image->bits.rowstride, |
|
4815 dest_image->bits.rowstride, |
|
4816 PIXMAN_FORMAT_BPP (src_image->bits.format), |
|
4817 PIXMAN_FORMAT_BPP (dest_image->bits.format), |
|
4818 src_x, src_y, dest_x, dest_y, width, height); |
|
4819 } |
|
4820 |
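/* OVER with an x8r8g8b8 source, an a8 mask and an a8r8g8b8 destination.
 * The source alpha byte is forced to 0xff, so only the mask modulates it. */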
|
4821 static void |
|
4822 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, |
|
4823 pixman_composite_info_t *info) |
|
4824 { |
|
4825 PIXMAN_COMPOSITE_ARGS (info); |
|
4826 uint32_t *src, *src_line, s; |
|
4827 uint32_t *dst, *dst_line, d; |
|
4828 uint8_t *mask, *mask_line; |
|
4829 uint32_t m; |
|
4830 int src_stride, mask_stride, dst_stride; |
|
4831 int32_t w; |
|
4832 __m128i ms; |
|
4833 |
|
4834 __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
|
4835 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
|
4836 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
|
4837 |
|
4838 PIXMAN_IMAGE_GET_LINE ( |
|
4839 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
4840 PIXMAN_IMAGE_GET_LINE ( |
|
4841 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
|
4842 PIXMAN_IMAGE_GET_LINE ( |
|
4843 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
4844 |
|
4845 while (height--) |
|
4846 { |
|
4847 src = src_line; |
|
4848 src_line += src_stride; |
|
4849 dst = dst_line; |
|
4850 dst_line += dst_stride; |
|
4851 mask = mask_line; |
|
4852 mask_line += mask_stride; |
|
4853 |
|
4854 w = width; |
|
4855 |
|
4856 while (w && (uintptr_t)dst & 15) |
|
4857 { |
|
4858 s = 0xff000000 | *src++; |
|
4859 m = (uint32_t) *mask++; |
|
4860 d = *dst; |
|
4861 ms = unpack_32_1x128 (s); |
|
4862 |
|
4863 if (m != 0xff) |
|
4864 { |
|
4865 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); |
|
4866 __m128i md = unpack_32_1x128 (d); |
|
4867 |
|
4868 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md); |
|
4869 } |
|
4870 |
|
4871 *dst++ = pack_1x128_32 (ms); |
|
4872 w--; |
|
4873 } |
|
4874 |
|
4875 while (w >= 4) |
|
4876 { |
|
4877 m = *(uint32_t*) mask; |
|
4878 xmm_src = _mm_or_si128 ( |
|
4879 load_128_unaligned ((__m128i*)src), mask_ff000000); |
|
4880 |
|
4881 if (m == 0xffffffff) |
|
4882 { |
|
4883 save_128_aligned ((__m128i*)dst, xmm_src); |
|
4884 } |
|
4885 else |
|
4886 { |
|
4887 xmm_dst = load_128_aligned ((__m128i*)dst); |
|
4888 |
|
4889 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); |
|
4890 |
|
4891 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
|
4892 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
|
4893 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
|
4894 |
|
4895 expand_alpha_rev_2x128 ( |
|
4896 xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
|
4897 |
|
4898 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, |
|
4899 &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, |
|
4900 &xmm_dst_lo, &xmm_dst_hi); |
|
4901 |
|
4902 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
4903 } |
|
4904 |
|
4905 src += 4; |
|
4906 dst += 4; |
|
4907 mask += 4; |
|
4908 w -= 4; |
|
4909 } |
|
4910 |
|
4911 while (w) |
|
4912 { |
|
4913 m = (uint32_t) *mask++; |
|
4914 |
|
4915 if (m) |
|
4916 { |
|
4917 s = 0xff000000 | *src; |
|
4918 |
|
4919 if (m == 0xff) |
|
4920 { |
|
4921 *dst = s; |
|
4922 } |
|
4923 else |
|
4924 { |
|
4925 __m128i ma, md, ms; |
|
4926 |
|
4927 d = *dst; |
|
4928 |
|
4929 ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); |
|
4930 md = unpack_32_1x128 (d); |
|
4931 ms = unpack_32_1x128 (s); |
|
4932 |
|
4933 *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md)); |
|
4934 } |
|
4935 |
|
4936 } |
|
4937 |
|
4938 src++; |
|
4939 dst++; |
|
4940 w--; |
|
4941 } |
|
4942 } |
|
4943 |
|
4944 } |
|
4945 |
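/* OVER with an a8r8g8b8 source, an a8 mask and an a8r8g8b8 destination:
 * dest = (src IN mask) OVER dest, with a direct store when both the
 * source pixels and the mask bytes are fully opaque. */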
|
4946 static void |
|
4947 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, |
|
4948 pixman_composite_info_t *info) |
|
4949 { |
|
4950 PIXMAN_COMPOSITE_ARGS (info); |
|
4951 uint32_t *src, *src_line, s; |
|
4952 uint32_t *dst, *dst_line, d; |
|
4953 uint8_t *mask, *mask_line; |
|
4954 uint32_t m; |
|
4955 int src_stride, mask_stride, dst_stride; |
|
4956 int32_t w; |
|
4957 |
|
4958 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; |
|
4959 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
|
4960 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
|
4961 |
|
4962 PIXMAN_IMAGE_GET_LINE ( |
|
4963 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
4964 PIXMAN_IMAGE_GET_LINE ( |
|
4965 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
|
4966 PIXMAN_IMAGE_GET_LINE ( |
|
4967 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
4968 |
|
4969 while (height--) |
|
4970 { |
|
4971 src = src_line; |
|
4972 src_line += src_stride; |
|
4973 dst = dst_line; |
|
4974 dst_line += dst_stride; |
|
4975 mask = mask_line; |
|
4976 mask_line += mask_stride; |
|
4977 |
|
4978 w = width; |
|
4979 |
|
4980 while (w && (uintptr_t)dst & 15) |
|
4981 { |
|
4982 uint32_t sa; |
|
4983 |
|
4984 s = *src++; |
|
4985 m = (uint32_t) *mask++; |
|
4986 d = *dst; |
|
4987 |
|
4988 sa = s >> 24; |
|
4989 |
|
4990 if (m) |
|
4991 { |
|
4992 if (sa == 0xff && m == 0xff) |
|
4993 { |
|
4994 *dst = s; |
|
4995 } |
|
4996 else |
|
4997 { |
|
4998 __m128i ms, md, ma, msa; |
|
4999 |
|
5000 ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); |
|
5001 ms = unpack_32_1x128 (s); |
|
5002 md = unpack_32_1x128 (d); |
|
5003 |
|
5004 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); |
|
5005 |
|
5006 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); |
|
5007 } |
|
5008 } |
|
5009 |
|
5010 dst++; |
|
5011 w--; |
|
5012 } |
|
5013 |
|
5014 while (w >= 4) |
|
5015 { |
|
5016 m = *(uint32_t *) mask; |
|
5017 |
|
5018 if (m) |
|
5019 { |
|
5020 xmm_src = load_128_unaligned ((__m128i*)src); |
|
5021 |
|
5022 if (m == 0xffffffff && is_opaque (xmm_src)) |
|
5023 { |
|
5024 save_128_aligned ((__m128i *)dst, xmm_src); |
|
5025 } |
|
5026 else |
|
5027 { |
|
5028 xmm_dst = load_128_aligned ((__m128i *)dst); |
|
5029 |
|
5030 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); |
|
5031 |
|
5032 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
|
5033 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
|
5034 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
|
5035 |
|
5036 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); |
|
5037 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
|
5038 |
|
5039 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, |
|
5040 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); |
|
5041 |
|
5042 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
5043 } |
|
5044 } |
|
5045 |
|
5046 src += 4; |
|
5047 dst += 4; |
|
5048 mask += 4; |
|
5049 w -= 4; |
|
5050 } |
|
5051 |
|
5052 while (w) |
|
5053 { |
|
5054 uint32_t sa; |
|
5055 |
|
5056 s = *src++; |
|
5057 m = (uint32_t) *mask++; |
|
5058 d = *dst; |
|
5059 |
|
5060 sa = s >> 24; |
|
5061 |
|
5062 if (m) |
|
5063 { |
|
5064 if (sa == 0xff && m == 0xff) |
|
5065 { |
|
5066 *dst = s; |
|
5067 } |
|
5068 else |
|
5069 { |
|
5070 __m128i ms, md, ma, msa; |
|
5071 |
|
5072 ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); |
|
5073 ms = unpack_32_1x128 (s); |
|
5074 md = unpack_32_1x128 (d); |
|
5075 |
|
5076 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); |
|
5077 |
|
5078 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); |
|
5079 } |
|
5080 } |
|
5081 |
|
5082 dst++; |
|
5083 w--; |
|
5084 } |
|
5085 } |
|
5086 |
|
5087 } |
|
5088 |
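/* OVER_REVERSE with a solid source: the existing destination pixels are
 * composited over the solid color and written back. */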
|
5089 static void |
|
5090 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp, |
|
5091 pixman_composite_info_t *info) |
|
5092 { |
|
5093 PIXMAN_COMPOSITE_ARGS (info); |
|
5094 uint32_t src; |
|
5095 uint32_t *dst_line, *dst; |
|
5096 __m128i xmm_src; |
|
5097 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
|
5098 __m128i xmm_dsta_hi, xmm_dsta_lo; |
|
5099 int dst_stride; |
|
5100 int32_t w; |
|
5101 |
|
5102 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
5103 |
|
5104 if (src == 0) |
|
5105 return; |
|
5106 |
|
5107 PIXMAN_IMAGE_GET_LINE ( |
|
5108 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
5109 |
|
5110 xmm_src = expand_pixel_32_1x128 (src); |
|
5111 |
|
5112 while (height--) |
|
5113 { |
|
5114 dst = dst_line; |
|
5115 |
|
5116 dst_line += dst_stride; |
|
5117 w = width; |
|
5118 |
|
5119 while (w && (uintptr_t)dst & 15) |
|
5120 { |
|
5121 __m128i vd; |
|
5122 |
|
5123 vd = unpack_32_1x128 (*dst); |
|
5124 |
|
5125 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), |
|
5126 xmm_src)); |
|
5127 w--; |
|
5128 dst++; |
|
5129 } |
|
5130 |
|
5131 while (w >= 4) |
|
5132 { |
|
5133 __m128i tmp_lo, tmp_hi; |
|
5134 |
|
5135 xmm_dst = load_128_aligned ((__m128i*)dst); |
|
5136 |
|
5137 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
|
5138 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi); |
|
5139 |
|
5140 tmp_lo = xmm_src; |
|
5141 tmp_hi = xmm_src; |
|
5142 |
|
5143 over_2x128 (&xmm_dst_lo, &xmm_dst_hi, |
|
5144 &xmm_dsta_lo, &xmm_dsta_hi, |
|
5145 &tmp_lo, &tmp_hi); |
|
5146 |
|
5147 save_128_aligned ( |
|
5148 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi)); |
|
5149 |
|
5150 w -= 4; |
|
5151 dst += 4; |
|
5152 } |
|
5153 |
|
5154 while (w) |
|
5155 { |
|
5156 __m128i vd; |
|
5157 |
|
5158 vd = unpack_32_1x128 (*dst); |
|
5159 |
|
5160 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), |
|
5161 xmm_src)); |
|
5162 w--; |
|
5163 dst++; |
|
5164 } |
|
5165 |
|
5166 } |
|
5167 |
|
5168 } |
|
5169 |
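/* OVER with an a8r8g8b8 source, an a8r8g8b8 mask (only its alpha channel
 * is used) and an a8r8g8b8 destination. */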
|
5170 static void |
|
5171 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, |
|
5172 pixman_composite_info_t *info) |
|
5173 { |
|
5174 PIXMAN_COMPOSITE_ARGS (info); |
|
5175 uint32_t *src, *src_line, s; |
|
5176 uint32_t *dst, *dst_line, d; |
|
5177 uint32_t *mask, *mask_line; |
|
5178 uint32_t m; |
|
5179 int src_stride, mask_stride, dst_stride; |
|
5180 int32_t w; |
|
5181 |
|
5182 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; |
|
5183 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
|
5184 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
|
5185 |
|
5186 PIXMAN_IMAGE_GET_LINE ( |
|
5187 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
5188 PIXMAN_IMAGE_GET_LINE ( |
|
5189 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
|
5190 PIXMAN_IMAGE_GET_LINE ( |
|
5191 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
5192 |
|
5193 while (height--) |
|
5194 { |
|
5195 src = src_line; |
|
5196 src_line += src_stride; |
|
5197 dst = dst_line; |
|
5198 dst_line += dst_stride; |
|
5199 mask = mask_line; |
|
5200 mask_line += mask_stride; |
|
5201 |
|
5202 w = width; |
|
5203 |
|
5204 while (w && (uintptr_t)dst & 15) |
|
5205 { |
|
5206 uint32_t sa; |
|
5207 |
|
5208 s = *src++; |
|
5209 m = (*mask++) >> 24; |
|
5210 d = *dst; |
|
5211 |
|
5212 sa = s >> 24; |
|
5213 |
|
5214 if (m) |
|
5215 { |
|
5216 if (sa == 0xff && m == 0xff) |
|
5217 { |
|
5218 *dst = s; |
|
5219 } |
|
5220 else |
|
5221 { |
|
5222 __m128i ms, md, ma, msa; |
|
5223 |
|
5224 ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); |
|
5225 ms = unpack_32_1x128 (s); |
|
5226 md = unpack_32_1x128 (d); |
|
5227 |
|
5228 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); |
|
5229 |
|
5230 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); |
|
5231 } |
|
5232 } |
|
5233 |
|
5234 dst++; |
|
5235 w--; |
|
5236 } |
|
5237 |
|
5238 while (w >= 4) |
|
5239 { |
|
5240 xmm_mask = load_128_unaligned ((__m128i*)mask); |
|
5241 |
|
5242 if (!is_transparent (xmm_mask)) |
|
5243 { |
|
5244 xmm_src = load_128_unaligned ((__m128i*)src); |
|
5245 |
|
5246 if (is_opaque (xmm_mask) && is_opaque (xmm_src)) |
|
5247 { |
|
5248 save_128_aligned ((__m128i *)dst, xmm_src); |
|
5249 } |
|
5250 else |
|
5251 { |
|
5252 xmm_dst = load_128_aligned ((__m128i *)dst); |
|
5253 |
|
5254 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
|
5255 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
|
5256 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
|
5257 |
|
5258 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); |
|
5259 expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
|
5260 |
|
5261 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, |
|
5262 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); |
|
5263 |
|
5264 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
5265 } |
|
5266 } |
|
5267 |
|
5268 src += 4; |
|
5269 dst += 4; |
|
5270 mask += 4; |
|
5271 w -= 4; |
|
5272 } |
|
5273 |
|
5274 while (w) |
|
5275 { |
|
5276 uint32_t sa; |
|
5277 |
|
5278 s = *src++; |
|
5279 m = (*mask++) >> 24; |
|
5280 d = *dst; |
|
5281 |
|
5282 sa = s >> 24; |
|
5283 |
|
5284 if (m) |
|
5285 { |
|
5286 if (sa == 0xff && m == 0xff) |
|
5287 { |
|
5288 *dst = s; |
|
5289 } |
|
5290 else |
|
5291 { |
|
5292 __m128i ms, md, ma, msa; |
|
5293 |
|
5294 ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); |
|
5295 ms = unpack_32_1x128 (s); |
|
5296 md = unpack_32_1x128 (d); |
|
5297 |
|
5298 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); |
|
5299 |
|
5300 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); |
|
5301 } |
|
5302 } |
|
5303 |
|
5304 dst++; |
|
5305 w--; |
|
5306 } |
|
5307 } |
|
5308 |
|
5309 } |
|
5310 |
|
5311 /* A variant of 'sse2_combine_over_u' with minor tweaks */ |
|
5312 static force_inline void |
|
5313 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, |
|
5314 const uint32_t* ps, |
|
5315 int32_t w, |
|
5316 pixman_fixed_t vx, |
|
5317 pixman_fixed_t unit_x, |
|
5318 pixman_fixed_t src_width_fixed, |
|
5319 pixman_bool_t fully_transparent_src) |
|
5320 { |
|
5321 uint32_t s, d; |
|
5322 const uint32_t* pm = NULL; |
|
5323 |
|
5324 __m128i xmm_dst_lo, xmm_dst_hi; |
|
5325 __m128i xmm_src_lo, xmm_src_hi; |
|
5326 __m128i xmm_alpha_lo, xmm_alpha_hi; |
|
5327 |
|
5328 if (fully_transparent_src) |
|
5329 return; |
|
5330 |
|
5331 /* Align dst on a 16-byte boundary */ |
|
5332 while (w && ((uintptr_t)pd & 15)) |
|
5333 { |
|
5334 d = *pd; |
|
5335 s = combine1 (ps + pixman_fixed_to_int (vx), pm); |
|
5336 vx += unit_x; |
|
5337 while (vx >= 0) |
|
5338 vx -= src_width_fixed; |
|
5339 |
|
5340 *pd++ = core_combine_over_u_pixel_sse2 (s, d); |
|
5341 if (pm) |
|
5342 pm++; |
|
5343 w--; |
|
5344 } |
|
5345 |
|
5346 while (w >= 4) |
|
5347 { |
|
5348 __m128i tmp; |
|
5349 uint32_t tmp1, tmp2, tmp3, tmp4; |
|
5350 |
|
5351 tmp1 = *(ps + pixman_fixed_to_int (vx)); |
|
5352 vx += unit_x; |
|
5353 while (vx >= 0) |
|
5354 vx -= src_width_fixed; |
|
5355 tmp2 = *(ps + pixman_fixed_to_int (vx)); |
|
5356 vx += unit_x; |
|
5357 while (vx >= 0) |
|
5358 vx -= src_width_fixed; |
|
5359 tmp3 = *(ps + pixman_fixed_to_int (vx)); |
|
5360 vx += unit_x; |
|
5361 while (vx >= 0) |
|
5362 vx -= src_width_fixed; |
|
5363 tmp4 = *(ps + pixman_fixed_to_int (vx)); |
|
5364 vx += unit_x; |
|
5365 while (vx >= 0) |
|
5366 vx -= src_width_fixed; |
|
5367 |
|
5368 tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); |
|
5369 |
|
5370 xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm); |
|
5371 |
|
5372 if (is_opaque (xmm_src_hi)) |
|
5373 { |
|
5374 save_128_aligned ((__m128i*)pd, xmm_src_hi); |
|
5375 } |
|
5376 else if (!is_zero (xmm_src_hi)) |
|
5377 { |
|
5378 xmm_dst_hi = load_128_aligned ((__m128i*) pd); |
|
5379 |
|
5380 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
|
5381 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
|
5382 |
|
5383 expand_alpha_2x128 ( |
|
5384 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); |
|
5385 |
|
5386 over_2x128 (&xmm_src_lo, &xmm_src_hi, |
|
5387 &xmm_alpha_lo, &xmm_alpha_hi, |
|
5388 &xmm_dst_lo, &xmm_dst_hi); |
|
5389 |
|
5390 /* rebuild the 4 pixel data and save */ |
|
5391 save_128_aligned ((__m128i*)pd, |
|
5392 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
5393 } |
|
5394 |
|
5395 w -= 4; |
|
5396 pd += 4; |
|
5397 if (pm) |
|
5398 pm += 4; |
|
5399 } |
|
5400 |
|
5401 while (w) |
|
5402 { |
|
5403 d = *pd; |
|
5404 s = combine1 (ps + pixman_fixed_to_int (vx), pm); |
|
5405 vx += unit_x; |
|
5406 while (vx >= 0) |
|
5407 vx -= src_width_fixed; |
|
5408 |
|
5409 *pd++ = core_combine_over_u_pixel_sse2 (s, d); |
|
5410 if (pm) |
|
5411 pm++; |
|
5412 |
|
5413 w--; |
|
5414 } |
|
5415 } |
|
5416 |
|
5417 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER, |
|
5418 scaled_nearest_scanline_sse2_8888_8888_OVER, |
|
5419 uint32_t, uint32_t, COVER) |
|
5420 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER, |
|
5421 scaled_nearest_scanline_sse2_8888_8888_OVER, |
|
5422 uint32_t, uint32_t, NONE) |
|
5423 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER, |
|
5424 scaled_nearest_scanline_sse2_8888_8888_OVER, |
|
5425 uint32_t, uint32_t, PAD) |
|
5426 FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER, |
|
5427 scaled_nearest_scanline_sse2_8888_8888_OVER, |
|
5428 uint32_t, uint32_t, NORMAL) |
|
5429 |
|
5430 static force_inline void |
|
5431 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, |
|
5432 uint32_t * dst, |
|
5433 const uint32_t * src, |
|
5434 int32_t w, |
|
5435 pixman_fixed_t vx, |
|
5436 pixman_fixed_t unit_x, |
|
5437 pixman_fixed_t src_width_fixed, |
|
5438 pixman_bool_t zero_src) |
|
5439 { |
|
5440 __m128i xmm_mask; |
|
5441 __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
|
5442 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
|
5443 __m128i xmm_alpha_lo, xmm_alpha_hi; |
|
5444 |
|
5445 if (zero_src || (*mask >> 24) == 0) |
|
5446 return; |
|
5447 |
|
5448 xmm_mask = create_mask_16_128 (*mask >> 24); |
|
5449 |
|
5450 while (w && (uintptr_t)dst & 15) |
|
5451 { |
|
5452 uint32_t s = *(src + pixman_fixed_to_int (vx)); |
|
5453 vx += unit_x; |
|
5454 while (vx >= 0) |
|
5455 vx -= src_width_fixed; |
|
5456 |
|
5457 if (s) |
|
5458 { |
|
5459 uint32_t d = *dst; |
|
5460 |
|
5461 __m128i ms = unpack_32_1x128 (s); |
|
5462 __m128i alpha = expand_alpha_1x128 (ms); |
|
5463 __m128i dest = xmm_mask; |
|
5464 __m128i alpha_dst = unpack_32_1x128 (d); |
|
5465 |
|
5466 *dst = pack_1x128_32 ( |
|
5467 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); |
|
5468 } |
|
5469 dst++; |
|
5470 w--; |
|
5471 } |
|
5472 |
|
5473 while (w >= 4) |
|
5474 { |
|
5475 uint32_t tmp1, tmp2, tmp3, tmp4; |
|
5476 |
|
5477 tmp1 = *(src + pixman_fixed_to_int (vx)); |
|
5478 vx += unit_x; |
|
5479 while (vx >= 0) |
|
5480 vx -= src_width_fixed; |
|
5481 tmp2 = *(src + pixman_fixed_to_int (vx)); |
|
5482 vx += unit_x; |
|
5483 while (vx >= 0) |
|
5484 vx -= src_width_fixed; |
|
5485 tmp3 = *(src + pixman_fixed_to_int (vx)); |
|
5486 vx += unit_x; |
|
5487 while (vx >= 0) |
|
5488 vx -= src_width_fixed; |
|
5489 tmp4 = *(src + pixman_fixed_to_int (vx)); |
|
5490 vx += unit_x; |
|
5491 while (vx >= 0) |
|
5492 vx -= src_width_fixed; |
|
5493 |
|
5494 xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); |
|
5495 |
|
5496 if (!is_zero (xmm_src)) |
|
5497 { |
|
5498 xmm_dst = load_128_aligned ((__m128i*)dst); |
|
5499 |
|
5500 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
|
5501 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
|
5502 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
|
5503 &xmm_alpha_lo, &xmm_alpha_hi); |
|
5504 |
|
5505 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, |
|
5506 &xmm_alpha_lo, &xmm_alpha_hi, |
|
5507 &xmm_mask, &xmm_mask, |
|
5508 &xmm_dst_lo, &xmm_dst_hi); |
|
5509 |
|
5510 save_128_aligned ( |
|
5511 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
5512 } |
|
5513 |
|
5514 dst += 4; |
|
5515 w -= 4; |
|
5516 } |
|
5517 |
|
5518 while (w) |
|
5519 { |
|
5520 uint32_t s = *(src + pixman_fixed_to_int (vx)); |
|
5521 vx += unit_x; |
|
5522 while (vx >= 0) |
|
5523 vx -= src_width_fixed; |
|
5524 |
|
5525 if (s) |
|
5526 { |
|
5527 uint32_t d = *dst; |
|
5528 |
|
5529 __m128i ms = unpack_32_1x128 (s); |
|
5530 __m128i alpha = expand_alpha_1x128 (ms); |
|
5531 __m128i mask = xmm_mask; |
|
5532 __m128i dest = unpack_32_1x128 (d); |
|
5533 |
|
5534 *dst = pack_1x128_32 ( |
|
5535 in_over_1x128 (&ms, &alpha, &mask, &dest)); |
|
5536 } |
|
5537 |
|
5538 dst++; |
|
5539 w--; |
|
5540 } |
|
5541 |
|
5542 } |
|
5543 |
|
5544 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, |
|
5545 scaled_nearest_scanline_sse2_8888_n_8888_OVER, |
|
5546 uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE) |
|
5547 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, |
|
5548 scaled_nearest_scanline_sse2_8888_n_8888_OVER, |
|
5549 uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE) |
|
5550 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, |
|
5551 scaled_nearest_scanline_sse2_8888_n_8888_OVER, |
|
5552 uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) |
|
5553 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, |
|
5554 scaled_nearest_scanline_sse2_8888_n_8888_OVER, |
|
5555 uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE) |
|
5556 |
|
5557 #define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1) |
|
5558 |
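/* Shared locals for the bilinear scanlines below: xmm_wt and xmm_wb hold
 * the vertical top/bottom weights replicated across eight 16-bit lanes,
 * xmm_x tracks the fixed-point x position used to derive the horizontal
 * weights, and xmm_ux advances it by one source step per output pixel. */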
|
5559 #define BILINEAR_DECLARE_VARIABLES \ |
|
5560 const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \ |
|
5561 const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \ |
|
5562 const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\ |
|
5563 const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \ |
|
5564 const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\ |
|
5565 const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \ |
|
5566 const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x, \ |
|
5567 unit_x, unit_x, unit_x, unit_x); \ |
|
5568 const __m128i xmm_zero = _mm_setzero_si128 (); \ |
|
5569 __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx) |
|
5570 |
|
5571 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \ |
|
5572 do { \ |
|
5573 __m128i xmm_wh, xmm_lo, xmm_hi, a; \ |
|
5574 /* fetch 2x2 pixel block into sse2 registers */ \ |
|
5575 __m128i tltr = _mm_loadl_epi64 ( \ |
|
5576 (__m128i *)&src_top[pixman_fixed_to_int (vx)]); \ |
|
5577 __m128i blbr = _mm_loadl_epi64 ( \ |
|
5578 (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]); \ |
|
5579 vx += unit_x; \ |
|
5580 /* vertical interpolation */ \ |
|
5581 a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), \ |
|
5582 xmm_wt), \ |
|
5583 _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), \ |
|
5584 xmm_wb)); \ |
|
5585 if (BILINEAR_INTERPOLATION_BITS < 8) \ |
|
5586 { \ |
|
5587 /* calculate horizontal weights */ \ |
|
5588 xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7, \ |
|
5589 _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \ |
|
5590 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ |
|
5591 /* horizontal interpolation */ \ |
|
5592 a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \ |
|
5593 a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh); \ |
|
5594 } \ |
|
5595 else \ |
|
5596 { \ |
|
5597 /* calculate horizontal weights */ \ |
|
5598 xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8, \ |
|
5599 _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \ |
|
5600 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ |
|
5601 /* horizontal interpolation */ \ |
|
5602 xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \ |
|
5603 xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \ |
|
5604 a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \ |
|
5605 _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \ |
|
5606 } \ |
|
5607 /* shift and pack the result */ \ |
|
5608 a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2); \ |
|
5609 a = _mm_packs_epi32 (a, a); \ |
|
5610 a = _mm_packus_epi16 (a, a); \ |
|
5611 pix = _mm_cvtsi128_si32 (a); \ |
|
5612 } while (0) |
|
5613 |
|
5614 #define BILINEAR_SKIP_ONE_PIXEL() \ |
|
5615 do { \ |
|
5616 vx += unit_x; \ |
|
5617 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ |
|
5618 } while(0) |
|
5619 |
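/* SRC with bilinear scaling: every destination pixel is simply the
 * bilinearly interpolated source pixel, emitted 4/2/1 at a time. */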
|
5620 static force_inline void |
|
5621 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst, |
|
5622 const uint32_t * mask, |
|
5623 const uint32_t * src_top, |
|
5624 const uint32_t * src_bottom, |
|
5625 int32_t w, |
|
5626 int wt, |
|
5627 int wb, |
|
5628 pixman_fixed_t vx, |
|
5629 pixman_fixed_t unit_x, |
|
5630 pixman_fixed_t max_vx, |
|
5631 pixman_bool_t zero_src) |
|
5632 { |
|
5633 BILINEAR_DECLARE_VARIABLES; |
|
5634 uint32_t pix1, pix2, pix3, pix4; |
|
5635 |
|
5636 while ((w -= 4) >= 0) |
|
5637 { |
|
5638 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
|
5639 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); |
|
5640 BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); |
|
5641 BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); |
|
5642 *dst++ = pix1; |
|
5643 *dst++ = pix2; |
|
5644 *dst++ = pix3; |
|
5645 *dst++ = pix4; |
|
5646 } |
|
5647 |
|
5648 if (w & 2) |
|
5649 { |
|
5650 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
|
5651 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); |
|
5652 *dst++ = pix1; |
|
5653 *dst++ = pix2; |
|
5654 } |
|
5655 |
|
5656 if (w & 1) |
|
5657 { |
|
5658 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
|
5659 *dst = pix1; |
|
5660 } |
|
5661 |
|
5662 } |
|
5663 |
|
5664 /* Add an extra NULL argument to the existing bilinear fast paths to indicate |

5665  * that they don't need two-pass processing */ |
|
5666 |
|
5667 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC, |
|
5668 scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, |
|
5669 uint32_t, uint32_t, uint32_t, |
|
5670 COVER, FLAG_NONE) |
|
5671 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC, |
|
5672 scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, |
|
5673 uint32_t, uint32_t, uint32_t, |
|
5674 PAD, FLAG_NONE) |
|
5675 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC, |
|
5676 scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, |
|
5677 uint32_t, uint32_t, uint32_t, |
|
5678 NONE, FLAG_NONE) |
|
5679 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC, |
|
5680 scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, |
|
5681 uint32_t, uint32_t, uint32_t, |
|
5682 NORMAL, FLAG_NONE) |
|
5683 |
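/* OVER with bilinear scaling: groups of four interpolated pixels are
 * stored directly when fully opaque and skipped when fully transparent. */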
|
5684 static force_inline void |
|
5685 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst, |
|
5686 const uint32_t * mask, |
|
5687 const uint32_t * src_top, |
|
5688 const uint32_t * src_bottom, |
|
5689 int32_t w, |
|
5690 int wt, |
|
5691 int wb, |
|
5692 pixman_fixed_t vx, |
|
5693 pixman_fixed_t unit_x, |
|
5694 pixman_fixed_t max_vx, |
|
5695 pixman_bool_t zero_src) |
|
5696 { |
|
5697 BILINEAR_DECLARE_VARIABLES; |
|
5698 uint32_t pix1, pix2, pix3, pix4; |
|
5699 |
|
5700 while (w && ((uintptr_t)dst & 15)) |
|
5701 { |
|
5702 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
|
5703 |
|
5704 if (pix1) |
|
5705 { |
|
5706 pix2 = *dst; |
|
5707 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2); |
|
5708 } |
|
5709 |
|
5710 w--; |
|
5711 dst++; |
|
5712 } |
|
5713 |
|
5714 while (w >= 4) |
|
5715 { |
|
5716 __m128i xmm_src; |
|
5717 __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo; |
|
5718 __m128i xmm_alpha_hi, xmm_alpha_lo; |
|
5719 |
|
5720 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
|
5721 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); |
|
5722 BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); |
|
5723 BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); |
|
5724 |
|
5725 xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); |
|
5726 |
|
5727 if (!is_zero (xmm_src)) |
|
5728 { |
|
5729 if (is_opaque (xmm_src)) |
|
5730 { |
|
5731 save_128_aligned ((__m128i *)dst, xmm_src); |
|
5732 } |
|
5733 else |
|
5734 { |
|
5735 __m128i xmm_dst = load_128_aligned ((__m128i *)dst); |
|
5736 |
|
5737 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
|
5738 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
|
5739 |
|
5740 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); |
|
5741 over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, |
|
5742 &xmm_dst_lo, &xmm_dst_hi); |
|
5743 |
|
5744 save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
5745 } |
|
5746 } |
|
5747 |
|
5748 w -= 4; |
|
5749 dst += 4; |
|
5750 } |
|
5751 |
|
5752 while (w) |
|
5753 { |
|
5754 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
|
5755 |
|
5756 if (pix1) |
|
5757 { |
|
5758 pix2 = *dst; |
|
5759 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2); |
|
5760 } |
|
5761 |
|
5762 w--; |
|
5763 dst++; |
|
5764 } |
|
5765 } |
|
5766 |
|
5767 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER, |
|
5768 scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, |
|
5769 uint32_t, uint32_t, uint32_t, |
|
5770 COVER, FLAG_NONE) |
|
5771 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER, |
|
5772 scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, |
|
5773 uint32_t, uint32_t, uint32_t, |
|
5774 PAD, FLAG_NONE) |
|
5775 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER, |
|
5776 scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, |
|
5777 uint32_t, uint32_t, uint32_t, |
|
5778 NONE, FLAG_NONE) |
|
5779 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER, |
|
5780 scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, |
|
5781 uint32_t, uint32_t, uint32_t, |
|
5782 NORMAL, FLAG_NONE) |
|
5783 |
|
5784 |
|
5785 /* An example of SSE2 two-stage bilinear_over_8888_0565 fast path, which is implemented |
|
5786 as scaled_bilinear_scanline_sse2_8888_8888_SRC + op_bilinear_over_8888_0565 */ |
|
5787 |
|
5788 static force_inline void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width) |
|
5789 { |
|
5790 /* Note: this is not really fast; it should be based on the 8-pixel loop from sse2_composite_over_8888_0565 */ |
|
5791 while (--width >= 0) |
|
5792 { |
|
5793 *dst = composite_over_8888_0565pixel (*src, *dst); |
|
5794 src++; |
|
5795 dst++; |
|
5796 } |
|
5797 } |
|
5798 |
|
5799 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_cover_OVER, |
|
5800 scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, |
|
5801 uint32_t, uint32_t, uint16_t, |
|
5802 COVER, FLAG_NONE) |
|
5803 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_pad_OVER, |
|
5804 scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, |
|
5805 uint32_t, uint32_t, uint16_t, |
|
5806 PAD, FLAG_NONE) |
|
5807 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_none_OVER, |
|
5808 scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, |
|
5809 uint32_t, uint32_t, uint16_t, |
|
5810 NONE, FLAG_NONE) |
|
5811 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_normal_OVER, |
|
5812 scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, |
|
5813 uint32_t, uint32_t, uint16_t, |
|
5814 NORMAL, FLAG_NONE) |
|
5815 |
|
5816 /*****************************/ |
|
5817 |
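/* OVER with bilinear scaling and an a8 mask; a group of four pixels whose
 * mask bytes are all zero is skipped without interpolating the source. */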
|
5818 static force_inline void |
|
5819 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst, |
|
5820 const uint8_t * mask, |
|
5821 const uint32_t * src_top, |
|
5822 const uint32_t * src_bottom, |
|
5823 int32_t w, |
|
5824 int wt, |
|
5825 int wb, |
|
5826 pixman_fixed_t vx, |
|
5827 pixman_fixed_t unit_x, |
|
5828 pixman_fixed_t max_vx, |
|
5829 pixman_bool_t zero_src) |
|
5830 { |
|
5831 BILINEAR_DECLARE_VARIABLES; |
|
5832 uint32_t pix1, pix2, pix3, pix4; |
|
5833 uint32_t m; |
|
5834 |
|
5835 while (w && ((uintptr_t)dst & 15)) |
|
5836 { |
|
5837 uint32_t sa; |
|
5838 |
|
5839 m = (uint32_t) *mask++; |
|
5840 |
|
5841 if (m) |
|
5842 { |
|
5843 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
|
5844 sa = pix1 >> 24; |
|
5845 |
|
5846 if (sa == 0xff && m == 0xff) |
|
5847 { |
|
5848 *dst = pix1; |
|
5849 } |
|
5850 else |
|
5851 { |
|
5852 __m128i ms, md, ma, msa; |
|
5853 |
|
5854 pix2 = *dst; |
|
5855 ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); |
|
5856 ms = unpack_32_1x128 (pix1); |
|
5857 md = unpack_32_1x128 (pix2); |
|
5858 |
|
5859 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); |
|
5860 |
|
5861 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); |
|
5862 } |
|
5863 } |
|
5864 else |
|
5865 { |
|
5866 BILINEAR_SKIP_ONE_PIXEL (); |
|
5867 } |
|
5868 |
|
5869 w--; |
|
5870 dst++; |
|
5871 } |
|
5872 |
|
5873 while (w >= 4) |
|
5874 { |
|
5875 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; |
|
5876 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
|
5877 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
|
5878 |
|
5879 m = *(uint32_t*)mask; |
|
5880 |
|
5881 if (m) |
|
5882 { |
|
5883 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
|
5884 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); |
|
5885 BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); |
|
5886 BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); |
|
5887 |
|
5888 xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); |
|
5889 |
|
5890 if (m == 0xffffffff && is_opaque (xmm_src)) |
|
5891 { |
|
5892 save_128_aligned ((__m128i *)dst, xmm_src); |
|
5893 } |
|
5894 else |
|
5895 { |
|
5896 xmm_dst = load_128_aligned ((__m128i *)dst); |
|
5897 |
|
5898 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); |
|
5899 |
|
5900 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
|
5901 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
|
5902 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
|
5903 |
|
5904 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); |
|
5905 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
|
5906 |
|
5907 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, |
|
5908 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); |
|
5909 |
|
5910 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
|
5911 } |
|
5912 } |
|
5913 else |
|
5914 { |
|
5915 BILINEAR_SKIP_ONE_PIXEL (); |
|
5916 BILINEAR_SKIP_ONE_PIXEL (); |
|
5917 BILINEAR_SKIP_ONE_PIXEL (); |
|
5918 BILINEAR_SKIP_ONE_PIXEL (); |
|
5919 } |
|
5920 |
|
5921 w -= 4; |
|
5922 dst += 4; |
|
5923 mask += 4; |
|
5924 } |
|
5925 |
|
5926 while (w) |
|
5927 { |
|
5928 uint32_t sa; |
|
5929 |
|
5930 m = (uint32_t) *mask++; |
|
5931 |
|
5932 if (m) |
|
5933 { |
|
5934 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
|
5935 sa = pix1 >> 24; |
|
5936 |
|
5937 if (sa == 0xff && m == 0xff) |
|
5938 { |
|
5939 *dst = pix1; |
|
5940 } |
|
5941 else |
|
5942 { |
|
5943 __m128i ms, md, ma, msa; |
|
5944 |
|
5945 pix2 = *dst; |
|
5946 ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); |
|
5947 ms = unpack_32_1x128 (pix1); |
|
5948 md = unpack_32_1x128 (pix2); |
|
5949 |
|
5950 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); |
|
5951 |
|
5952 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); |
|
5953 } |
|
5954 } |
|
5955 else |
|
5956 { |
|
5957 BILINEAR_SKIP_ONE_PIXEL (); |
|
5958 } |
|
5959 |
|
5960 w--; |
|
5961 dst++; |
|
5962 } |
|
5963 } |
|
5964 |
|
5965 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER, |
|
5966 scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, |
|
5967 uint32_t, uint8_t, uint32_t, |
|
5968 COVER, FLAG_HAVE_NON_SOLID_MASK) |
|
5969 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER, |
|
5970 scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, |
|
5971 uint32_t, uint8_t, uint32_t, |
|
5972 PAD, FLAG_HAVE_NON_SOLID_MASK) |
|
5973 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER, |
|
5974 scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, |
|
5975 uint32_t, uint8_t, uint32_t, |
|
5976 NONE, FLAG_HAVE_NON_SOLID_MASK) |
|
5977 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER, |
|
5978 scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, |
|
5979 uint32_t, uint8_t, uint32_t, |
|
5980 NORMAL, FLAG_HAVE_NON_SOLID_MASK) |
|
5981 |
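/* OVER with bilinear scaling and a solid mask taken from the alpha byte
 * of *mask; a zero source or zero mask alpha makes the scanline a no-op. */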
|
static force_inline void
scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst,
                                                const uint32_t * mask,
                                                const uint32_t * src_top,
                                                const uint32_t * src_bottom,
                                                int32_t w,
                                                int wt,
                                                int wb,
                                                pixman_fixed_t vx,
                                                pixman_fixed_t unit_x,
                                                pixman_fixed_t max_vx,
                                                pixman_bool_t zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1, pix2, pix3, pix4;
    __m128i xmm_mask;

    if (zero_src || (*mask >> 24) == 0)
        return;

    xmm_mask = create_mask_16_128 (*mask >> 24);

    while (w && ((uintptr_t)dst & 15))
    {
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
        if (pix1)
        {
            uint32_t d = *dst;

            __m128i ms = unpack_32_1x128 (pix1);
            __m128i alpha = expand_alpha_1x128 (ms);
            __m128i dest = xmm_mask;
            __m128i alpha_dst = unpack_32_1x128 (d);

            *dst = pack_1x128_32
                (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
        }

        dst++;
        w--;
    }

    while (w >= 4)
    {
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);

        if (pix1 | pix2 | pix3 | pix4)
        {
            __m128i xmm_src, xmm_src_lo, xmm_src_hi;
            __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
            __m128i xmm_alpha_lo, xmm_alpha_hi;

            xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);

            xmm_dst = load_128_aligned ((__m128i*)dst);

            unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
            unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
            expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                                &xmm_alpha_lo, &xmm_alpha_hi);

            in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
                           &xmm_alpha_lo, &xmm_alpha_hi,
                           &xmm_mask, &xmm_mask,
                           &xmm_dst_lo, &xmm_dst_hi);

            save_128_aligned
                ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
        }

        dst += 4;
        w -= 4;
    }

    while (w)
    {
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
        if (pix1)
        {
            uint32_t d = *dst;

            __m128i ms = unpack_32_1x128 (pix1);
            __m128i alpha = expand_alpha_1x128 (ms);
            __m128i dest = xmm_mask;
            __m128i alpha_dst = unpack_32_1x128 (d);

            *dst = pack_1x128_32
                (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
        }

        dst++;
        w--;
    }
}

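/* Main-loop instantiations for the solid-mask bilinear OVER scanline above,
 * again one per repeat mode; these should correspond to the
 * SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (..., sse2_8888_n_8888) entries in the
 * fast path table below.
 */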
|
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
                               scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
                               uint32_t, uint32_t, uint32_t,
                               COVER, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
                               scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
                               uint32_t, uint32_t, uint32_t,
                               PAD, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
                               scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
                               uint32_t, uint32_t, uint32_t,
                               NONE, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
                               scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
                               uint32_t, uint32_t, uint32_t,
                               NORMAL, FLAG_HAVE_SOLID_MASK)

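/* The fast path table maps (operator, source format, mask format, destination
 * format) combinations to the SSE2 composite routines defined in this file.
 * _pixman_implementation_create_sse2 () hands it to
 * _pixman_implementation_create () below, presumably so these specialized
 * paths are preferred over the general fallback implementation.
 */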
|
static const pixman_fast_path_t sse2_fast_paths[] =
{
    /* PIXMAN_OP_OVER */
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),

    /* PIXMAN_OP_OVER_REVERSE */
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),

    /* PIXMAN_OP_ADD */
    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),

    /* PIXMAN_OP_SRC */
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),

    /* PIXMAN_OP_IN */
    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),

    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),

    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),

    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),

    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),

    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),

|
    /* Bilinear OVER fast paths for a8r8g8b8/a8b8g8r8 sources onto 565 destinations */
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, sse2_8888_0565),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, b5g6r5, sse2_8888_0565),

    { PIXMAN_OP_NONE },
};

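/* Narrow source iterators: each fetcher reads one scanline of the source
 * image and expands it to a8r8g8b8 in iter->buffer. sse2_fetch_x8r8g8b8
 * simply ORs in an opaque alpha channel, handling the unaligned head and
 * tail pixel by pixel and the aligned middle four pixels at a time.
 */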
|
static uint32_t *
sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    __m128i ff000000 = mask_ff000000;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 0x0f)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    while (w >= 4)
    {
        save_128_aligned (
            (__m128i *)dst, _mm_or_si128 (
                load_128_unaligned ((__m128i *)src), ff000000));

        dst += 4;
        src += 4;
        w -= 4;
    }

    while (w)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    return iter->buffer;
}

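/* Fetch a scanline of r5g6b5 pixels, converting them to opaque a8r8g8b8.
 * The vector loop widens eight 565 pixels at a time with unpack_565_to_8888
 * and forces the alpha byte to 0xff; leading and trailing pixels fall back
 * to convert_0565_to_8888.
 */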
|
static uint32_t *
sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;
    __m128i ff000000 = mask_ff000000;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 0x0f)
    {
        uint16_t s = *src++;

        *dst++ = convert_0565_to_8888 (s);
        w--;
    }

    while (w >= 8)
    {
        __m128i lo, hi, s;

        s = _mm_loadu_si128 ((__m128i *)src);

        lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
        hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));

        save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
        save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));

        dst += 8;
        src += 8;
        w -= 8;
    }

    while (w)
    {
        uint16_t s = *src++;

        *dst++ = convert_0565_to_8888 (s);
        w--;
    }

    return iter->buffer;
}

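/* Fetch a scanline of a8 pixels, expanding each byte into the alpha channel
 * of an a8r8g8b8 pixel (the color channels are left at zero). The unpack
 * sequence interleaves with zeroes so that sixteen mask bytes are widened to
 * sixteen 32-bit pixels per iteration.
 */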
|
static uint32_t *
sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;

    iter->bits += iter->stride;

    while (w && (((uintptr_t)dst) & 15))
    {
        *dst++ = *(src++) << 24;
        w--;
    }

    while (w >= 16)
    {
        xmm0 = _mm_loadu_si128 ((__m128i *)src);

        xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128 (), xmm0);
        xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128 (), xmm0);
        xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128 (), xmm1);
        xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128 (), xmm1);
        xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128 (), xmm2);
        xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128 (), xmm2);

        _mm_store_si128 (((__m128i *)(dst + 0)), xmm3);
        _mm_store_si128 (((__m128i *)(dst + 4)), xmm4);
        _mm_store_si128 (((__m128i *)(dst + 8)), xmm5);
        _mm_store_si128 (((__m128i *)(dst + 12)), xmm6);

        dst += 16;
        src += 16;
        w -= 16;
    }

    while (w)
    {
        *dst++ = *(src++) << 24;
        w--;
    }

    return iter->buffer;
}

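/* Table mapping source formats to the scanline fetchers above; it is scanned
 * by sse2_src_iter_init () and terminated by PIXMAN_null.
 */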
|
typedef struct
{
    pixman_format_code_t format;
    pixman_iter_get_scanline_t get_scanline;
} fetcher_info_t;

static const fetcher_info_t fetchers[] =
{
    { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
    { PIXMAN_r5g6b5,   sse2_fetch_r5g6b5 },
    { PIXMAN_a8,       sse2_fetch_a8 },
    { PIXMAN_null }
};

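/* Install one of the SSE2 fetchers for a narrow source iterator, provided the
 * image is an untransformed bits image whose samples cover the clip (see
 * FLAGS below) and its format matches a fetchers[] entry. Returning FALSE
 * leaves iterator setup to the more general code paths.
 */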
|
static pixman_bool_t
sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
{
    pixman_image_t *image = iter->image;

#define FLAGS                                                           \
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |               \
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

    if ((iter->iter_flags & ITER_NARROW) &&
        (iter->image_flags & FLAGS) == FLAGS)
    {
        const fetcher_info_t *f;

        for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
        {
            if (image->common.extended_format_code == f->format)
            {
                uint8_t *b = (uint8_t *)image->bits.bits;
                int s = image->bits.rowstride * 4;

                iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
                iter->stride = s;

                iter->get_scanline = f->get_scanline;
                return TRUE;
            }
        }
    }

    return FALSE;
}

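/* Create the SSE2 implementation: initialize the __m128i constants used
 * throughout this file and hook up the combiner, blt, fill and source
 * iterator entry points on top of the given fallback implementation. The
 * force_align_arg_pointer attribute on 32-bit GCC builds presumably exists to
 * guarantee the 16-byte stack alignment that SSE2 spills require even when
 * callers do not maintain it.
 */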
|
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
pixman_implementation_t *
_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);

    /* SSE2 constants */
    mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
    mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
    mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);

    /* Set up function pointers */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;

    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;

    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

    imp->src_iter_init = sse2_src_iter_init;

    return imp;
}