gfx/cairo/libpixman/src/pixman-sse2.c

1 /*
2 * Copyright © 2008 Rodrigo Kumpera
3 * Copyright © 2008 André Tupinambá
4 *
5 * Permission to use, copy, modify, distribute, and sell this software and its
6 * documentation for any purpose is hereby granted without fee, provided that
7 * the above copyright notice appear in all copies and that both that
8 * copyright notice and this permission notice appear in supporting
9 * documentation, and that the name of Red Hat not be used in advertising or
10 * publicity pertaining to distribution of the software without specific,
11 * written prior permission. Red Hat makes no representations about the
12 * suitability of this software for any purpose. It is provided "as is"
13 * without express or implied warranty.
14 *
15 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
16 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
17 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
18 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
19 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
20 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
21 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
22 * SOFTWARE.
23 *
24 * Author: Rodrigo Kumpera (kumpera@gmail.com)
25 * André Tupinambá (andrelrt@gmail.com)
26 *
27 * Based on work by Owen Taylor and Søren Sandmann
28 */
29 #ifdef HAVE_CONFIG_H
30 #include <config.h>
31 #endif
32
33 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
34 #include <emmintrin.h> /* for SSE2 intrinsics */
35 #include "pixman-private.h"
36 #include "pixman-combine32.h"
37 #include "pixman-inlines.h"
38
39 static __m128i mask_0080;
40 static __m128i mask_00ff;
41 static __m128i mask_0101;
42 static __m128i mask_ffff;
43 static __m128i mask_ff000000;
44 static __m128i mask_alpha;
45
46 static __m128i mask_565_r;
47 static __m128i mask_565_g1, mask_565_g2;
48 static __m128i mask_565_b;
49 static __m128i mask_red;
50 static __m128i mask_green;
51 static __m128i mask_blue;
52
53 static __m128i mask_565_fix_rb;
54 static __m128i mask_565_fix_g;
55
56 static __m128i mask_565_rb;
57 static __m128i mask_565_pack_multiplier;
58
59 static force_inline __m128i
60 unpack_32_1x128 (uint32_t data)
61 {
62 return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
63 }
64
65 static force_inline void
66 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
67 {
68 *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
69 *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
70 }
71
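/* Expand one r5g6b5 value per 32-bit lane to 8888 (alpha left zero): each
 * field is shifted to its byte position and its top bits are replicated
 * into the freed low bits, so e.g. 0x1f widens to 0xff rather than 0xf8.
 */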
72 static force_inline __m128i
73 unpack_565_to_8888 (__m128i lo)
74 {
75 __m128i r, g, b, rb, t;
76
77 r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
78 g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
79 b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
80
81 rb = _mm_or_si128 (r, b);
82 t = _mm_and_si128 (rb, mask_565_fix_rb);
83 t = _mm_srli_epi32 (t, 5);
84 rb = _mm_or_si128 (rb, t);
85
86 t = _mm_and_si128 (g, mask_565_fix_g);
87 t = _mm_srli_epi32 (t, 6);
88 g = _mm_or_si128 (g, t);
89
90 return _mm_or_si128 (rb, g);
91 }
92
93 static force_inline void
94 unpack_565_128_4x128 (__m128i data,
95 __m128i* data0,
96 __m128i* data1,
97 __m128i* data2,
98 __m128i* data3)
99 {
100 __m128i lo, hi;
101
102 lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
103 hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
104
105 lo = unpack_565_to_8888 (lo);
106 hi = unpack_565_to_8888 (hi);
107
108 unpack_128_2x128 (lo, data0, data1);
109 unpack_128_2x128 (hi, data2, data3);
110 }
111
112 static force_inline uint16_t
113 pack_565_32_16 (uint32_t pixel)
114 {
115 return (uint16_t) (((pixel >> 8) & 0xf800) |
116 ((pixel >> 5) & 0x07e0) |
117 ((pixel >> 3) & 0x001f));
118 }
119
120 static force_inline __m128i
121 pack_2x128_128 (__m128i lo, __m128i hi)
122 {
123 return _mm_packus_epi16 (lo, hi);
124 }
125
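/* Pack eight packed-8888 pixels (four per input register) down to eight
 * r5g6b5 values.  _mm_madd_epi16 with mask_565_pack_multiplier (set up
 * when the SSE2 implementation is created) repositions the red and blue
 * fields in a single multiply-add; green is masked and OR'ed in
 * separately, and the shift/pack at the end emulates the SSE4.1
 * _mm_packus_epi32 using signed saturation.
 */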
126 static force_inline __m128i
127 pack_565_2packedx128_128 (__m128i lo, __m128i hi)
128 {
129 __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
130 __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
131
132 __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
133 __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
134
135 __m128i g0 = _mm_and_si128 (lo, mask_green);
136 __m128i g1 = _mm_and_si128 (hi, mask_green);
137
138 t0 = _mm_or_si128 (t0, g0);
139 t1 = _mm_or_si128 (t1, g1);
140
141 /* Simulates _mm_packus_epi32 */
142 t0 = _mm_slli_epi32 (t0, 16 - 5);
143 t1 = _mm_slli_epi32 (t1, 16 - 5);
144 t0 = _mm_srai_epi32 (t0, 16);
145 t1 = _mm_srai_epi32 (t1, 16);
146 return _mm_packs_epi32 (t0, t1);
147 }
148
149 static force_inline __m128i
150 pack_565_2x128_128 (__m128i lo, __m128i hi)
151 {
152 __m128i data;
153 __m128i r, g1, g2, b;
154
155 data = pack_2x128_128 (lo, hi);
156
157 r = _mm_and_si128 (data, mask_565_r);
158 g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
159 g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
160 b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
161
162 return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
163 }
164
165 static force_inline __m128i
166 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
167 {
168 return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
169 pack_565_2x128_128 (*xmm2, *xmm3));
170 }
171
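/* In the _mm_movemask_epi8 result, bits 3, 7, 11 and 15 correspond to the
 * alpha byte of each of the four a8r8g8b8 pixels in the vector, hence the
 * 0x8888 masks in is_opaque() and is_transparent() below.
 */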
172 static force_inline int
173 is_opaque (__m128i x)
174 {
175 __m128i ffs = _mm_cmpeq_epi8 (x, x);
176
177 return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
178 }
179
180 static force_inline int
181 is_zero (__m128i x)
182 {
183 return _mm_movemask_epi8 (
184 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
185 }
186
187 static force_inline int
188 is_transparent (__m128i x)
189 {
190 return (_mm_movemask_epi8 (
191 _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
192 }
193
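/* Duplicate one unpacked pixel into both 64-bit halves of the register so
 * a solid color can be fed to the *_2x128 helpers.
 */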
194 static force_inline __m128i
195 expand_pixel_32_1x128 (uint32_t data)
196 {
197 return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
198 }
199
200 static force_inline __m128i
201 expand_alpha_1x128 (__m128i data)
202 {
203 return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
204 _MM_SHUFFLE (3, 3, 3, 3)),
205 _MM_SHUFFLE (3, 3, 3, 3));
206 }
207
208 static force_inline void
209 expand_alpha_2x128 (__m128i data_lo,
210 __m128i data_hi,
211 __m128i* alpha_lo,
212 __m128i* alpha_hi)
213 {
214 __m128i lo, hi;
215
216 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
217 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
218
219 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
220 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
221 }
222
223 static force_inline void
224 expand_alpha_rev_2x128 (__m128i data_lo,
225 __m128i data_hi,
226 __m128i* alpha_lo,
227 __m128i* alpha_hi)
228 {
229 __m128i lo, hi;
230
231 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
232 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
233 *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
234 *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
235 }
236
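/* Per-channel multiply of unpacked pixels with exact division by 255:
 * for 8-bit values held in 16-bit lanes, ((x * a + 0x80) * 0x0101) >> 16
 * is the correctly rounded x * a / 255, so the mullo/adds/mulhi sequence
 * below implements MUL_UN8 for eight channels per register (mask_0080 and
 * mask_0101, set up when the implementation is created, hold 0x0080 and
 * 0x0101 in every 16-bit lane).
 */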
237 static force_inline void
238 pix_multiply_2x128 (__m128i* data_lo,
239 __m128i* data_hi,
240 __m128i* alpha_lo,
241 __m128i* alpha_hi,
242 __m128i* ret_lo,
243 __m128i* ret_hi)
244 {
245 __m128i lo, hi;
246
247 lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
248 hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
249 lo = _mm_adds_epu16 (lo, mask_0080);
250 hi = _mm_adds_epu16 (hi, mask_0080);
251 *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
252 *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
253 }
254
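/* Compute src * alpha_dst + dst * alpha_src per channel with unsigned
 * saturation; this is the shared kernel of the ATOP, reverse-ATOP and XOR
 * combiners.
 */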
255 static force_inline void
256 pix_add_multiply_2x128 (__m128i* src_lo,
257 __m128i* src_hi,
258 __m128i* alpha_dst_lo,
259 __m128i* alpha_dst_hi,
260 __m128i* dst_lo,
261 __m128i* dst_hi,
262 __m128i* alpha_src_lo,
263 __m128i* alpha_src_hi,
264 __m128i* ret_lo,
265 __m128i* ret_hi)
266 {
267 __m128i t1_lo, t1_hi;
268 __m128i t2_lo, t2_hi;
269
270 pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
271 pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
272
273 *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
274 *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
275 }
276
277 static force_inline void
278 negate_2x128 (__m128i data_lo,
279 __m128i data_hi,
280 __m128i* neg_lo,
281 __m128i* neg_hi)
282 {
283 *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
284 *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
285 }
286
287 static force_inline void
288 invert_colors_2x128 (__m128i data_lo,
289 __m128i data_hi,
290 __m128i* inv_lo,
291 __m128i* inv_hi)
292 {
293 __m128i lo, hi;
294
295 lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
296 hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
297 *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
298 *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
299 }
300
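/* Porter-Duff OVER for premultiplied pixels, computed in place on dst:
 * dst = src + dst * (0xff - alpha) / 0xff, where alpha is normally the
 * expanded source alpha.
 */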
301 static force_inline void
302 over_2x128 (__m128i* src_lo,
303 __m128i* src_hi,
304 __m128i* alpha_lo,
305 __m128i* alpha_hi,
306 __m128i* dst_lo,
307 __m128i* dst_hi)
308 {
309 __m128i t1, t2;
310
311 negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
312
313 pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
314
315 *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
316 *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
317 }
318
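/* OVER for non-premultiplied, R/B-swapped source pixels: the colors are
 * swapped back, the alpha lane is forced to 0xff by OR'ing in mask_alpha
 * (set up when the implementation is created) so that premultiplying
 * leaves the alpha channel intact, and the result is composited over the
 * destination.
 */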
319 static force_inline void
320 over_rev_non_pre_2x128 (__m128i src_lo,
321 __m128i src_hi,
322 __m128i* dst_lo,
323 __m128i* dst_hi)
324 {
325 __m128i lo, hi;
326 __m128i alpha_lo, alpha_hi;
327
328 expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
329
330 lo = _mm_or_si128 (alpha_lo, mask_alpha);
331 hi = _mm_or_si128 (alpha_hi, mask_alpha);
332
333 invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
334
335 pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
336
337 over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
338 }
339
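/* IN followed by OVER: both the source and its alpha are first multiplied
 * by the mask (the IN step), then composited over the destination.
 */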
340 static force_inline void
341 in_over_2x128 (__m128i* src_lo,
342 __m128i* src_hi,
343 __m128i* alpha_lo,
344 __m128i* alpha_hi,
345 __m128i* mask_lo,
346 __m128i* mask_hi,
347 __m128i* dst_lo,
348 __m128i* dst_hi)
349 {
350 __m128i s_lo, s_hi;
351 __m128i a_lo, a_hi;
352
353 pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
354 pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
355
356 over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
357 }
358
359 /* load 4 pixels from a 16-byte boundary aligned address */
360 static force_inline __m128i
361 load_128_aligned (__m128i* src)
362 {
363 return _mm_load_si128 (src);
364 }
365
366 /* load 4 pixels from an unaligned address */
367 static force_inline __m128i
368 load_128_unaligned (const __m128i* src)
369 {
370 return _mm_loadu_si128 (src);
371 }
372
373 /* save 4 pixels using Write Combining memory on a 16-byte
374 * boundary aligned address
375 */
376 static force_inline void
377 save_128_write_combining (__m128i* dst,
378 __m128i data)
379 {
380 _mm_stream_si128 (dst, data);
381 }
382
383 /* save 4 pixels on a 16-byte boundary aligned address */
384 static force_inline void
385 save_128_aligned (__m128i* dst,
386 __m128i data)
387 {
388 _mm_store_si128 (dst, data);
389 }
390
391 /* save 4 pixels to an unaligned address */
392 static force_inline void
393 save_128_unaligned (__m128i* dst,
394 __m128i data)
395 {
396 _mm_storeu_si128 (dst, data);
397 }
398
399 static force_inline __m128i
400 load_32_1x128 (uint32_t data)
401 {
402 return _mm_cvtsi32_si128 (data);
403 }
404
405 static force_inline __m128i
406 expand_alpha_rev_1x128 (__m128i data)
407 {
408 return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
409 }
410
411 static force_inline __m128i
412 expand_pixel_8_1x128 (uint8_t data)
413 {
414 return _mm_shufflelo_epi16 (
415 unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
416 }
417
418 static force_inline __m128i
419 pix_multiply_1x128 (__m128i data,
420 __m128i alpha)
421 {
422 return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
423 mask_0080),
424 mask_0101);
425 }
426
427 static force_inline __m128i
428 pix_add_multiply_1x128 (__m128i* src,
429 __m128i* alpha_dst,
430 __m128i* dst,
431 __m128i* alpha_src)
432 {
433 __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
434 __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
435
436 return _mm_adds_epu8 (t1, t2);
437 }
438
439 static force_inline __m128i
440 negate_1x128 (__m128i data)
441 {
442 return _mm_xor_si128 (data, mask_00ff);
443 }
444
445 static force_inline __m128i
446 invert_colors_1x128 (__m128i data)
447 {
448 return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
449 }
450
451 static force_inline __m128i
452 over_1x128 (__m128i src, __m128i alpha, __m128i dst)
453 {
454 return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
455 }
456
457 static force_inline __m128i
458 in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
459 {
460 return over_1x128 (pix_multiply_1x128 (*src, *mask),
461 pix_multiply_1x128 (*alpha, *mask),
462 *dst);
463 }
464
465 static force_inline __m128i
466 over_rev_non_pre_1x128 (__m128i src, __m128i dst)
467 {
468 __m128i alpha = expand_alpha_1x128 (src);
469
470 return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
471 _mm_or_si128 (alpha, mask_alpha)),
472 alpha,
473 dst);
474 }
475
476 static force_inline uint32_t
477 pack_1x128_32 (__m128i data)
478 {
479 return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
480 }
481
482 static force_inline __m128i
483 expand565_16_1x128 (uint16_t pixel)
484 {
485 __m128i m = _mm_cvtsi32_si128 (pixel);
486
487 m = unpack_565_to_8888 (m);
488
489 return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
490 }
491
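/* Scalar OVER for a single pixel, with fast paths: a fully opaque source
 * replaces the destination outright and a zero source leaves it untouched.
 */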
492 static force_inline uint32_t
493 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
494 {
495 uint8_t a;
496 __m128i xmms;
497
498 a = src >> 24;
499
500 if (a == 0xff)
501 {
502 return src;
503 }
504 else if (src)
505 {
506 xmms = unpack_32_1x128 (src);
507 return pack_1x128_32 (
508 over_1x128 (xmms, expand_alpha_1x128 (xmms),
509 unpack_32_1x128 (dst)));
510 }
511
512 return dst;
513 }
514
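/* combine1() and combine4() below apply an optional mask to one or four
 * source pixels: the mask's alpha is expanded and multiplied into the
 * source, which is the IN step shared by the unified (_u) combiners.
 */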
515 static force_inline uint32_t
516 combine1 (const uint32_t *ps, const uint32_t *pm)
517 {
518 uint32_t s = *ps;
519
520 if (pm)
521 {
522 __m128i ms, mm;
523
524 mm = unpack_32_1x128 (*pm);
525 mm = expand_alpha_1x128 (mm);
526
527 ms = unpack_32_1x128 (s);
528 ms = pix_multiply_1x128 (ms, mm);
529
530 s = pack_1x128_32 (ms);
531 }
532
533 return s;
534 }
535
536 static force_inline __m128i
537 combine4 (const __m128i *ps, const __m128i *pm)
538 {
539 __m128i xmm_src_lo, xmm_src_hi;
540 __m128i xmm_msk_lo, xmm_msk_hi;
541 __m128i s;
542
543 if (pm)
544 {
545 xmm_msk_lo = load_128_unaligned (pm);
546
547 if (is_transparent (xmm_msk_lo))
548 return _mm_setzero_si128 ();
549 }
550
551 s = load_128_unaligned (ps);
552
553 if (pm)
554 {
555 unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
556 unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
557
558 expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
559
560 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
561 &xmm_msk_lo, &xmm_msk_hi,
562 &xmm_src_lo, &xmm_src_hi);
563
564 s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
565 }
566
567 return s;
568 }
569
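/* The combiners below all follow the same structure: a scalar lead-in
 * loop until dst reaches a 16-byte boundary, a main loop handling four
 * pixels per iteration with aligned stores (and, where possible,
 * shortcuts for all-zero or all-opaque vectors), and a scalar tail for
 * the remaining pixels.
 */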
570 static force_inline void
571 core_combine_over_u_sse2_mask (uint32_t * pd,
572 const uint32_t* ps,
573 const uint32_t* pm,
574 int w)
575 {
576 uint32_t s, d;
577
578 /* Align dst on a 16-byte boundary */
579 while (w && ((uintptr_t)pd & 15))
580 {
581 d = *pd;
582 s = combine1 (ps, pm);
583
584 if (s)
585 *pd = core_combine_over_u_pixel_sse2 (s, d);
586 pd++;
587 ps++;
588 pm++;
589 w--;
590 }
591
592 while (w >= 4)
593 {
594 __m128i mask = load_128_unaligned ((__m128i *)pm);
595
596 if (!is_zero (mask))
597 {
598 __m128i src;
599 __m128i src_hi, src_lo;
600 __m128i mask_hi, mask_lo;
601 __m128i alpha_hi, alpha_lo;
602
603 src = load_128_unaligned ((__m128i *)ps);
604
605 if (is_opaque (_mm_and_si128 (src, mask)))
606 {
607 save_128_aligned ((__m128i *)pd, src);
608 }
609 else
610 {
611 __m128i dst = load_128_aligned ((__m128i *)pd);
612 __m128i dst_hi, dst_lo;
613
614 unpack_128_2x128 (mask, &mask_lo, &mask_hi);
615 unpack_128_2x128 (src, &src_lo, &src_hi);
616
617 expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
618 pix_multiply_2x128 (&src_lo, &src_hi,
619 &mask_lo, &mask_hi,
620 &src_lo, &src_hi);
621
622 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
623
624 expand_alpha_2x128 (src_lo, src_hi,
625 &alpha_lo, &alpha_hi);
626
627 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
628 &dst_lo, &dst_hi);
629
630 save_128_aligned (
631 (__m128i *)pd,
632 pack_2x128_128 (dst_lo, dst_hi));
633 }
634 }
635
636 pm += 4;
637 ps += 4;
638 pd += 4;
639 w -= 4;
640 }
641 while (w)
642 {
643 d = *pd;
644 s = combine1 (ps, pm);
645
646 if (s)
647 *pd = core_combine_over_u_pixel_sse2 (s, d);
648 pd++;
649 ps++;
650 pm++;
651
652 w--;
653 }
654 }
655
656 static force_inline void
657 core_combine_over_u_sse2_no_mask (uint32_t * pd,
658 const uint32_t* ps,
659 int w)
660 {
661 uint32_t s, d;
662
663 /* Align dst on a 16-byte boundary */
664 while (w && ((uintptr_t)pd & 15))
665 {
666 d = *pd;
667 s = *ps;
668
669 if (s)
670 *pd = core_combine_over_u_pixel_sse2 (s, d);
671 pd++;
672 ps++;
673 w--;
674 }
675
676 while (w >= 4)
677 {
678 __m128i src;
679 __m128i src_hi, src_lo, dst_hi, dst_lo;
680 __m128i alpha_hi, alpha_lo;
681
682 src = load_128_unaligned ((__m128i *)ps);
683
684 if (!is_zero (src))
685 {
686 if (is_opaque (src))
687 {
688 save_128_aligned ((__m128i *)pd, src);
689 }
690 else
691 {
692 __m128i dst = load_128_aligned ((__m128i *)pd);
693
694 unpack_128_2x128 (src, &src_lo, &src_hi);
695 unpack_128_2x128 (dst, &dst_lo, &dst_hi);
696
697 expand_alpha_2x128 (src_lo, src_hi,
698 &alpha_lo, &alpha_hi);
699 over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
700 &dst_lo, &dst_hi);
701
702 save_128_aligned (
703 (__m128i *)pd,
704 pack_2x128_128 (dst_lo, dst_hi));
705 }
706 }
707
708 ps += 4;
709 pd += 4;
710 w -= 4;
711 }
712 while (w)
713 {
714 d = *pd;
715 s = *ps;
716
717 if (s)
718 *pd = core_combine_over_u_pixel_sse2 (s, d);
719 pd++;
720 ps++;
721
722 w--;
723 }
724 }
725
726 static force_inline void
727 sse2_combine_over_u (pixman_implementation_t *imp,
728 pixman_op_t op,
729 uint32_t * pd,
730 const uint32_t * ps,
731 const uint32_t * pm,
732 int w)
733 {
734 if (pm)
735 core_combine_over_u_sse2_mask (pd, ps, pm, w);
736 else
737 core_combine_over_u_sse2_no_mask (pd, ps, w);
738 }
739
740 static void
741 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
742 pixman_op_t op,
743 uint32_t * pd,
744 const uint32_t * ps,
745 const uint32_t * pm,
746 int w)
747 {
748 uint32_t s, d;
749
750 __m128i xmm_dst_lo, xmm_dst_hi;
751 __m128i xmm_src_lo, xmm_src_hi;
752 __m128i xmm_alpha_lo, xmm_alpha_hi;
753
754 /* Align dst on a 16-byte boundary */
755 while (w &&
756 ((uintptr_t)pd & 15))
757 {
758 d = *pd;
759 s = combine1 (ps, pm);
760
761 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
762 w--;
763 ps++;
764 if (pm)
765 pm++;
766 }
767
768 while (w >= 4)
769 {
770 /* I'm loading unaligned because I'm not sure
771 * about the address alignment.
772 */
773 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
774 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
775
776 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
777 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
778
779 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
780 &xmm_alpha_lo, &xmm_alpha_hi);
781
782 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
783 &xmm_alpha_lo, &xmm_alpha_hi,
784 &xmm_src_lo, &xmm_src_hi);
785
786 /* rebuild the 4 pixels and save */
787 save_128_aligned ((__m128i*)pd,
788 pack_2x128_128 (xmm_src_lo, xmm_src_hi));
789
790 w -= 4;
791 ps += 4;
792 pd += 4;
793
794 if (pm)
795 pm += 4;
796 }
797
798 while (w)
799 {
800 d = *pd;
801 s = combine1 (ps, pm);
802
803 *pd++ = core_combine_over_u_pixel_sse2 (d, s);
804 ps++;
805 w--;
806 if (pm)
807 pm++;
808 }
809 }
810
811 static force_inline uint32_t
812 core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
813 {
814 uint32_t maska = src >> 24;
815
816 if (maska == 0)
817 {
818 return 0;
819 }
820 else if (maska != 0xff)
821 {
822 return pack_1x128_32 (
823 pix_multiply_1x128 (unpack_32_1x128 (dst),
824 expand_alpha_1x128 (unpack_32_1x128 (src))));
825 }
826
827 return dst;
828 }
829
830 static void
831 sse2_combine_in_u (pixman_implementation_t *imp,
832 pixman_op_t op,
833 uint32_t * pd,
834 const uint32_t * ps,
835 const uint32_t * pm,
836 int w)
837 {
838 uint32_t s, d;
839
840 __m128i xmm_src_lo, xmm_src_hi;
841 __m128i xmm_dst_lo, xmm_dst_hi;
842
843 while (w && ((uintptr_t)pd & 15))
844 {
845 s = combine1 (ps, pm);
846 d = *pd;
847
848 *pd++ = core_combine_in_u_pixel_sse2 (d, s);
849 w--;
850 ps++;
851 if (pm)
852 pm++;
853 }
854
855 while (w >= 4)
856 {
857 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
858 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
859
860 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
861 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
862
863 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
864 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
865 &xmm_dst_lo, &xmm_dst_hi,
866 &xmm_dst_lo, &xmm_dst_hi);
867
868 save_128_aligned ((__m128i*)pd,
869 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
870
871 ps += 4;
872 pd += 4;
873 w -= 4;
874 if (pm)
875 pm += 4;
876 }
877
878 while (w)
879 {
880 s = combine1 (ps, pm);
881 d = *pd;
882
883 *pd++ = core_combine_in_u_pixel_sse2 (d, s);
884 w--;
885 ps++;
886 if (pm)
887 pm++;
888 }
889 }
890
891 static void
892 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
893 pixman_op_t op,
894 uint32_t * pd,
895 const uint32_t * ps,
896 const uint32_t * pm,
897 int w)
898 {
899 uint32_t s, d;
900
901 __m128i xmm_src_lo, xmm_src_hi;
902 __m128i xmm_dst_lo, xmm_dst_hi;
903
904 while (w && ((uintptr_t)pd & 15))
905 {
906 s = combine1 (ps, pm);
907 d = *pd;
908
909 *pd++ = core_combine_in_u_pixel_sse2 (s, d);
910 ps++;
911 w--;
912 if (pm)
913 pm++;
914 }
915
916 while (w >= 4)
917 {
918 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
919 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
920
921 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
922 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
923
924 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
925 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
926 &xmm_src_lo, &xmm_src_hi,
927 &xmm_dst_lo, &xmm_dst_hi);
928
929 save_128_aligned (
930 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
931
932 ps += 4;
933 pd += 4;
934 w -= 4;
935 if (pm)
936 pm += 4;
937 }
938
939 while (w)
940 {
941 s = combine1 (ps, pm);
942 d = *pd;
943
944 *pd++ = core_combine_in_u_pixel_sse2 (s, d);
945 w--;
946 ps++;
947 if (pm)
948 pm++;
949 }
950 }
951
952 static void
953 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
954 pixman_op_t op,
955 uint32_t * pd,
956 const uint32_t * ps,
957 const uint32_t * pm,
958 int w)
959 {
960 while (w && ((uintptr_t)pd & 15))
961 {
962 uint32_t s = combine1 (ps, pm);
963 uint32_t d = *pd;
964
965 *pd++ = pack_1x128_32 (
966 pix_multiply_1x128 (
967 unpack_32_1x128 (d), negate_1x128 (
968 expand_alpha_1x128 (unpack_32_1x128 (s)))));
969
970 if (pm)
971 pm++;
972 ps++;
973 w--;
974 }
975
976 while (w >= 4)
977 {
978 __m128i xmm_src_lo, xmm_src_hi;
979 __m128i xmm_dst_lo, xmm_dst_hi;
980
981 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
982 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
983
984 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
985 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
986
987 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
988 negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
989
990 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
991 &xmm_src_lo, &xmm_src_hi,
992 &xmm_dst_lo, &xmm_dst_hi);
993
994 save_128_aligned (
995 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
996
997 ps += 4;
998 pd += 4;
999 if (pm)
1000 pm += 4;
1001
1002 w -= 4;
1003 }
1004
1005 while (w)
1006 {
1007 uint32_t s = combine1 (ps, pm);
1008 uint32_t d = *pd;
1009
1010 *pd++ = pack_1x128_32 (
1011 pix_multiply_1x128 (
1012 unpack_32_1x128 (d), negate_1x128 (
1013 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1014 ps++;
1015 if (pm)
1016 pm++;
1017 w--;
1018 }
1019 }
1020
1021 static void
1022 sse2_combine_out_u (pixman_implementation_t *imp,
1023 pixman_op_t op,
1024 uint32_t * pd,
1025 const uint32_t * ps,
1026 const uint32_t * pm,
1027 int w)
1028 {
1029 while (w && ((uintptr_t)pd & 15))
1030 {
1031 uint32_t s = combine1 (ps, pm);
1032 uint32_t d = *pd;
1033
1034 *pd++ = pack_1x128_32 (
1035 pix_multiply_1x128 (
1036 unpack_32_1x128 (s), negate_1x128 (
1037 expand_alpha_1x128 (unpack_32_1x128 (d)))));
1038 w--;
1039 ps++;
1040 if (pm)
1041 pm++;
1042 }
1043
1044 while (w >= 4)
1045 {
1046 __m128i xmm_src_lo, xmm_src_hi;
1047 __m128i xmm_dst_lo, xmm_dst_hi;
1048
1049 xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1050 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1051
1052 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1053 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1054
1055 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1056 negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1057
1058 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1059 &xmm_dst_lo, &xmm_dst_hi,
1060 &xmm_dst_lo, &xmm_dst_hi);
1061
1062 save_128_aligned (
1063 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1064
1065 ps += 4;
1066 pd += 4;
1067 w -= 4;
1068 if (pm)
1069 pm += 4;
1070 }
1071
1072 while (w)
1073 {
1074 uint32_t s = combine1 (ps, pm);
1075 uint32_t d = *pd;
1076
1077 *pd++ = pack_1x128_32 (
1078 pix_multiply_1x128 (
1079 unpack_32_1x128 (s), negate_1x128 (
1080 expand_alpha_1x128 (unpack_32_1x128 (d)))));
1081 w--;
1082 ps++;
1083 if (pm)
1084 pm++;
1085 }
1086 }
1087
1088 static force_inline uint32_t
1089 core_combine_atop_u_pixel_sse2 (uint32_t src,
1090 uint32_t dst)
1091 {
1092 __m128i s = unpack_32_1x128 (src);
1093 __m128i d = unpack_32_1x128 (dst);
1094
1095 __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
1096 __m128i da = expand_alpha_1x128 (d);
1097
1098 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1099 }
1100
1101 static void
1102 sse2_combine_atop_u (pixman_implementation_t *imp,
1103 pixman_op_t op,
1104 uint32_t * pd,
1105 const uint32_t * ps,
1106 const uint32_t * pm,
1107 int w)
1108 {
1109 uint32_t s, d;
1110
1111 __m128i xmm_src_lo, xmm_src_hi;
1112 __m128i xmm_dst_lo, xmm_dst_hi;
1113 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1114 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1115
1116 while (w && ((uintptr_t)pd & 15))
1117 {
1118 s = combine1 (ps, pm);
1119 d = *pd;
1120
1121 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1122 w--;
1123 ps++;
1124 if (pm)
1125 pm++;
1126 }
1127
1128 while (w >= 4)
1129 {
1130 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1131 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1132
1133 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1134 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1135
1136 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1137 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1138 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1139 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1140
1141 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1142 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1143
1144 pix_add_multiply_2x128 (
1145 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1146 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1147 &xmm_dst_lo, &xmm_dst_hi);
1148
1149 save_128_aligned (
1150 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1151
1152 ps += 4;
1153 pd += 4;
1154 w -= 4;
1155 if (pm)
1156 pm += 4;
1157 }
1158
1159 while (w)
1160 {
1161 s = combine1 (ps, pm);
1162 d = *pd;
1163
1164 *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1165 w--;
1166 ps++;
1167 if (pm)
1168 pm++;
1169 }
1170 }
1171
1172 static force_inline uint32_t
1173 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1174 uint32_t dst)
1175 {
1176 __m128i s = unpack_32_1x128 (src);
1177 __m128i d = unpack_32_1x128 (dst);
1178
1179 __m128i sa = expand_alpha_1x128 (s);
1180 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
1181
1182 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1183 }
1184
1185 static void
1186 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
1187 pixman_op_t op,
1188 uint32_t * pd,
1189 const uint32_t * ps,
1190 const uint32_t * pm,
1191 int w)
1192 {
1193 uint32_t s, d;
1194
1195 __m128i xmm_src_lo, xmm_src_hi;
1196 __m128i xmm_dst_lo, xmm_dst_hi;
1197 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1198 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1199
1200 while (w && ((uintptr_t)pd & 15))
1201 {
1202 s = combine1 (ps, pm);
1203 d = *pd;
1204
1205 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1206 ps++;
1207 w--;
1208 if (pm)
1209 pm++;
1210 }
1211
1212 while (w >= 4)
1213 {
1214 xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1215 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1216
1217 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1218 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1219
1220 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1221 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1222 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1223 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1224
1225 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1226 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1227
1228 pix_add_multiply_2x128 (
1229 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1230 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1231 &xmm_dst_lo, &xmm_dst_hi);
1232
1233 save_128_aligned (
1234 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1235
1236 ps += 4;
1237 pd += 4;
1238 w -= 4;
1239 if (pm)
1240 pm += 4;
1241 }
1242
1243 while (w)
1244 {
1245 s = combine1 (ps, pm);
1246 d = *pd;
1247
1248 *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1249 ps++;
1250 w--;
1251 if (pm)
1252 pm++;
1253 }
1254 }
1255
1256 static force_inline uint32_t
1257 core_combine_xor_u_pixel_sse2 (uint32_t src,
1258 uint32_t dst)
1259 {
1260 __m128i s = unpack_32_1x128 (src);
1261 __m128i d = unpack_32_1x128 (dst);
1262
1263 __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
1264 __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
1265
1266 return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
1267 }
1268
1269 static void
1270 sse2_combine_xor_u (pixman_implementation_t *imp,
1271 pixman_op_t op,
1272 uint32_t * dst,
1273 const uint32_t * src,
1274 const uint32_t * mask,
1275 int width)
1276 {
1277 int w = width;
1278 uint32_t s, d;
1279 uint32_t* pd = dst;
1280 const uint32_t* ps = src;
1281 const uint32_t* pm = mask;
1282
1283 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1284 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1285 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1286 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1287
1288 while (w && ((uintptr_t)pd & 15))
1289 {
1290 s = combine1 (ps, pm);
1291 d = *pd;
1292
1293 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1294 w--;
1295 ps++;
1296 if (pm)
1297 pm++;
1298 }
1299
1300 while (w >= 4)
1301 {
1302 xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1303 xmm_dst = load_128_aligned ((__m128i*) pd);
1304
1305 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1306 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1307
1308 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1309 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1310 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1311 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1312
1313 negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1314 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1315 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1316 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1317
1318 pix_add_multiply_2x128 (
1319 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1320 &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1321 &xmm_dst_lo, &xmm_dst_hi);
1322
1323 save_128_aligned (
1324 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1325
1326 ps += 4;
1327 pd += 4;
1328 w -= 4;
1329 if (pm)
1330 pm += 4;
1331 }
1332
1333 while (w)
1334 {
1335 s = combine1 (ps, pm);
1336 d = *pd;
1337
1338 *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1339 w--;
1340 ps++;
1341 if (pm)
1342 pm++;
1343 }
1344 }
1345
1346 static force_inline void
1347 sse2_combine_add_u (pixman_implementation_t *imp,
1348 pixman_op_t op,
1349 uint32_t * dst,
1350 const uint32_t * src,
1351 const uint32_t * mask,
1352 int width)
1353 {
1354 int w = width;
1355 uint32_t s, d;
1356 uint32_t* pd = dst;
1357 const uint32_t* ps = src;
1358 const uint32_t* pm = mask;
1359
1360 while (w && (uintptr_t)pd & 15)
1361 {
1362 s = combine1 (ps, pm);
1363 d = *pd;
1364
1365 ps++;
1366 if (pm)
1367 pm++;
1368 *pd++ = _mm_cvtsi128_si32 (
1369 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1370 w--;
1371 }
1372
1373 while (w >= 4)
1374 {
1375 __m128i s;
1376
1377 s = combine4 ((__m128i*)ps, (__m128i*)pm);
1378
1379 save_128_aligned (
1380 (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));
1381
1382 pd += 4;
1383 ps += 4;
1384 if (pm)
1385 pm += 4;
1386 w -= 4;
1387 }
1388
1389 while (w--)
1390 {
1391 s = combine1 (ps, pm);
1392 d = *pd;
1393
1394 ps++;
1395 *pd++ = _mm_cvtsi128_si32 (
1396 _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1397 if (pm)
1398 pm++;
1399 }
1400 }
1401
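/* SATURATE: if the source alpha exceeds the space left in the destination
 * (~dst alpha), the source is first scaled by DIV_UN8 (da, sa) so that the
 * saturating add below does not overflow.
 */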
1402 static force_inline uint32_t
1403 core_combine_saturate_u_pixel_sse2 (uint32_t src,
1404 uint32_t dst)
1405 {
1406 __m128i ms = unpack_32_1x128 (src);
1407 __m128i md = unpack_32_1x128 (dst);
1408 uint32_t sa = src >> 24;
1409 uint32_t da = ~dst >> 24;
1410
1411 if (sa > da)
1412 {
1413 ms = pix_multiply_1x128 (
1414 ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
1415 }
1416
1417 return pack_1x128_32 (_mm_adds_epu16 (md, ms));
1418 }
1419
1420 static void
1421 sse2_combine_saturate_u (pixman_implementation_t *imp,
1422 pixman_op_t op,
1423 uint32_t * pd,
1424 const uint32_t * ps,
1425 const uint32_t * pm,
1426 int w)
1427 {
1428 uint32_t s, d;
1429
1430 uint32_t pack_cmp;
1431 __m128i xmm_src, xmm_dst;
1432
1433 while (w && (uintptr_t)pd & 15)
1434 {
1435 s = combine1 (ps, pm);
1436 d = *pd;
1437
1438 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1439 w--;
1440 ps++;
1441 if (pm)
1442 pm++;
1443 }
1444
1445 while (w >= 4)
1446 {
1447 xmm_dst = load_128_aligned ((__m128i*)pd);
1448 xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1449
1450 pack_cmp = _mm_movemask_epi8 (
1451 _mm_cmpgt_epi32 (
1452 _mm_srli_epi32 (xmm_src, 24),
1453 _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1454
1455 /* if any source alpha is greater than the corresponding ~dst alpha */
1456 if (pack_cmp)
1457 {
1458 s = combine1 (ps++, pm);
1459 d = *pd;
1460 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1461 if (pm)
1462 pm++;
1463
1464 s = combine1 (ps++, pm);
1465 d = *pd;
1466 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1467 if (pm)
1468 pm++;
1469
1470 s = combine1 (ps++, pm);
1471 d = *pd;
1472 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1473 if (pm)
1474 pm++;
1475
1476 s = combine1 (ps++, pm);
1477 d = *pd;
1478 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1479 if (pm)
1480 pm++;
1481 }
1482 else
1483 {
1484 save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1485
1486 pd += 4;
1487 ps += 4;
1488 if (pm)
1489 pm += 4;
1490 }
1491
1492 w -= 4;
1493 }
1494
1495 while (w--)
1496 {
1497 s = combine1 (ps, pm);
1498 d = *pd;
1499
1500 *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1501 ps++;
1502 if (pm)
1503 pm++;
1504 }
1505 }
1506
1507 static void
1508 sse2_combine_src_ca (pixman_implementation_t *imp,
1509 pixman_op_t op,
1510 uint32_t * pd,
1511 const uint32_t * ps,
1512 const uint32_t * pm,
1513 int w)
1514 {
1515 uint32_t s, m;
1516
1517 __m128i xmm_src_lo, xmm_src_hi;
1518 __m128i xmm_mask_lo, xmm_mask_hi;
1519 __m128i xmm_dst_lo, xmm_dst_hi;
1520
1521 while (w && (uintptr_t)pd & 15)
1522 {
1523 s = *ps++;
1524 m = *pm++;
1525 *pd++ = pack_1x128_32 (
1526 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1527 w--;
1528 }
1529
1530 while (w >= 4)
1531 {
1532 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1533 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1534
1535 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1536 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1537
1538 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1539 &xmm_mask_lo, &xmm_mask_hi,
1540 &xmm_dst_lo, &xmm_dst_hi);
1541
1542 save_128_aligned (
1543 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1544
1545 ps += 4;
1546 pd += 4;
1547 pm += 4;
1548 w -= 4;
1549 }
1550
1551 while (w)
1552 {
1553 s = *ps++;
1554 m = *pm++;
1555 *pd++ = pack_1x128_32 (
1556 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1557 w--;
1558 }
1559 }
1560
1561 static force_inline uint32_t
1562 core_combine_over_ca_pixel_sse2 (uint32_t src,
1563 uint32_t mask,
1564 uint32_t dst)
1565 {
1566 __m128i s = unpack_32_1x128 (src);
1567 __m128i expAlpha = expand_alpha_1x128 (s);
1568 __m128i unpk_mask = unpack_32_1x128 (mask);
1569 __m128i unpk_dst = unpack_32_1x128 (dst);
1570
1571 return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1572 }
1573
1574 static void
1575 sse2_combine_over_ca (pixman_implementation_t *imp,
1576 pixman_op_t op,
1577 uint32_t * pd,
1578 const uint32_t * ps,
1579 const uint32_t * pm,
1580 int w)
1581 {
1582 uint32_t s, m, d;
1583
1584 __m128i xmm_alpha_lo, xmm_alpha_hi;
1585 __m128i xmm_src_lo, xmm_src_hi;
1586 __m128i xmm_dst_lo, xmm_dst_hi;
1587 __m128i xmm_mask_lo, xmm_mask_hi;
1588
1589 while (w && (uintptr_t)pd & 15)
1590 {
1591 s = *ps++;
1592 m = *pm++;
1593 d = *pd;
1594
1595 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1596 w--;
1597 }
1598
1599 while (w >= 4)
1600 {
1601 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1602 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1603 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1604
1605 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1606 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1607 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1608
1609 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1610 &xmm_alpha_lo, &xmm_alpha_hi);
1611
1612 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1613 &xmm_alpha_lo, &xmm_alpha_hi,
1614 &xmm_mask_lo, &xmm_mask_hi,
1615 &xmm_dst_lo, &xmm_dst_hi);
1616
1617 save_128_aligned (
1618 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1619
1620 ps += 4;
1621 pd += 4;
1622 pm += 4;
1623 w -= 4;
1624 }
1625
1626 while (w)
1627 {
1628 s = *ps++;
1629 m = *pm++;
1630 d = *pd;
1631
1632 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1633 w--;
1634 }
1635 }
1636
1637 static force_inline uint32_t
1638 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1639 uint32_t mask,
1640 uint32_t dst)
1641 {
1642 __m128i d = unpack_32_1x128 (dst);
1643
1644 return pack_1x128_32 (
1645 over_1x128 (d, expand_alpha_1x128 (d),
1646 pix_multiply_1x128 (unpack_32_1x128 (src),
1647 unpack_32_1x128 (mask))));
1648 }
1649
1650 static void
1651 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
1652 pixman_op_t op,
1653 uint32_t * pd,
1654 const uint32_t * ps,
1655 const uint32_t * pm,
1656 int w)
1657 {
1658 uint32_t s, m, d;
1659
1660 __m128i xmm_alpha_lo, xmm_alpha_hi;
1661 __m128i xmm_src_lo, xmm_src_hi;
1662 __m128i xmm_dst_lo, xmm_dst_hi;
1663 __m128i xmm_mask_lo, xmm_mask_hi;
1664
1665 while (w && (uintptr_t)pd & 15)
1666 {
1667 s = *ps++;
1668 m = *pm++;
1669 d = *pd;
1670
1671 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1672 w--;
1673 }
1674
1675 while (w >= 4)
1676 {
1677 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1678 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1679 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1680
1681 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1682 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1683 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1684
1685 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1686 &xmm_alpha_lo, &xmm_alpha_hi);
1687 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1688 &xmm_mask_lo, &xmm_mask_hi,
1689 &xmm_mask_lo, &xmm_mask_hi);
1690
1691 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1692 &xmm_alpha_lo, &xmm_alpha_hi,
1693 &xmm_mask_lo, &xmm_mask_hi);
1694
1695 save_128_aligned (
1696 (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1697
1698 ps += 4;
1699 pd += 4;
1700 pm += 4;
1701 w -= 4;
1702 }
1703
1704 while (w)
1705 {
1706 s = *ps++;
1707 m = *pm++;
1708 d = *pd;
1709
1710 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1711 w--;
1712 }
1713 }
1714
1715 static void
1716 sse2_combine_in_ca (pixman_implementation_t *imp,
1717 pixman_op_t op,
1718 uint32_t * pd,
1719 const uint32_t * ps,
1720 const uint32_t * pm,
1721 int w)
1722 {
1723 uint32_t s, m, d;
1724
1725 __m128i xmm_alpha_lo, xmm_alpha_hi;
1726 __m128i xmm_src_lo, xmm_src_hi;
1727 __m128i xmm_dst_lo, xmm_dst_hi;
1728 __m128i xmm_mask_lo, xmm_mask_hi;
1729
1730 while (w && (uintptr_t)pd & 15)
1731 {
1732 s = *ps++;
1733 m = *pm++;
1734 d = *pd;
1735
1736 *pd++ = pack_1x128_32 (
1737 pix_multiply_1x128 (
1738 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1739 expand_alpha_1x128 (unpack_32_1x128 (d))));
1740
1741 w--;
1742 }
1743
1744 while (w >= 4)
1745 {
1746 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1747 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1748 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1749
1750 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1751 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1752 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1753
1754 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1755 &xmm_alpha_lo, &xmm_alpha_hi);
1756
1757 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1758 &xmm_mask_lo, &xmm_mask_hi,
1759 &xmm_dst_lo, &xmm_dst_hi);
1760
1761 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1762 &xmm_alpha_lo, &xmm_alpha_hi,
1763 &xmm_dst_lo, &xmm_dst_hi);
1764
1765 save_128_aligned (
1766 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1767
1768 ps += 4;
1769 pd += 4;
1770 pm += 4;
1771 w -= 4;
1772 }
1773
1774 while (w)
1775 {
1776 s = *ps++;
1777 m = *pm++;
1778 d = *pd;
1779
1780 *pd++ = pack_1x128_32 (
1781 pix_multiply_1x128 (
1782 pix_multiply_1x128 (
1783 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1784 expand_alpha_1x128 (unpack_32_1x128 (d))));
1785
1786 w--;
1787 }
1788 }
1789
1790 static void
1791 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1792 pixman_op_t op,
1793 uint32_t * pd,
1794 const uint32_t * ps,
1795 const uint32_t * pm,
1796 int w)
1797 {
1798 uint32_t s, m, d;
1799
1800 __m128i xmm_alpha_lo, xmm_alpha_hi;
1801 __m128i xmm_src_lo, xmm_src_hi;
1802 __m128i xmm_dst_lo, xmm_dst_hi;
1803 __m128i xmm_mask_lo, xmm_mask_hi;
1804
1805 while (w && (uintptr_t)pd & 15)
1806 {
1807 s = *ps++;
1808 m = *pm++;
1809 d = *pd;
1810
1811 *pd++ = pack_1x128_32 (
1812 pix_multiply_1x128 (
1813 unpack_32_1x128 (d),
1814 pix_multiply_1x128 (unpack_32_1x128 (m),
1815 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1816 w--;
1817 }
1818
1819 while (w >= 4)
1820 {
1821 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1822 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1823 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1824
1825 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1826 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1827 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1828
1829 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1830 &xmm_alpha_lo, &xmm_alpha_hi);
1831 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1832 &xmm_alpha_lo, &xmm_alpha_hi,
1833 &xmm_alpha_lo, &xmm_alpha_hi);
1834
1835 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1836 &xmm_alpha_lo, &xmm_alpha_hi,
1837 &xmm_dst_lo, &xmm_dst_hi);
1838
1839 save_128_aligned (
1840 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1841
1842 ps += 4;
1843 pd += 4;
1844 pm += 4;
1845 w -= 4;
1846 }
1847
1848 while (w)
1849 {
1850 s = *ps++;
1851 m = *pm++;
1852 d = *pd;
1853
1854 *pd++ = pack_1x128_32 (
1855 pix_multiply_1x128 (
1856 unpack_32_1x128 (d),
1857 pix_multiply_1x128 (unpack_32_1x128 (m),
1858 expand_alpha_1x128 (unpack_32_1x128 (s)))));
1859 w--;
1860 }
1861 }
1862
1863 static void
1864 sse2_combine_out_ca (pixman_implementation_t *imp,
1865 pixman_op_t op,
1866 uint32_t * pd,
1867 const uint32_t * ps,
1868 const uint32_t * pm,
1869 int w)
1870 {
1871 uint32_t s, m, d;
1872
1873 __m128i xmm_alpha_lo, xmm_alpha_hi;
1874 __m128i xmm_src_lo, xmm_src_hi;
1875 __m128i xmm_dst_lo, xmm_dst_hi;
1876 __m128i xmm_mask_lo, xmm_mask_hi;
1877
1878 while (w && (uintptr_t)pd & 15)
1879 {
1880 s = *ps++;
1881 m = *pm++;
1882 d = *pd;
1883
1884 *pd++ = pack_1x128_32 (
1885 pix_multiply_1x128 (
1886 pix_multiply_1x128 (
1887 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1888 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1889 w--;
1890 }
1891
1892 while (w >= 4)
1893 {
1894 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1895 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1896 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1897
1898 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1899 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1900 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1901
1902 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1903 &xmm_alpha_lo, &xmm_alpha_hi);
1904 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1905 &xmm_alpha_lo, &xmm_alpha_hi);
1906
1907 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1908 &xmm_mask_lo, &xmm_mask_hi,
1909 &xmm_dst_lo, &xmm_dst_hi);
1910 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1911 &xmm_alpha_lo, &xmm_alpha_hi,
1912 &xmm_dst_lo, &xmm_dst_hi);
1913
1914 save_128_aligned (
1915 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1916
1917 ps += 4;
1918 pd += 4;
1919 pm += 4;
1920 w -= 4;
1921 }
1922
1923 while (w)
1924 {
1925 s = *ps++;
1926 m = *pm++;
1927 d = *pd;
1928
1929 *pd++ = pack_1x128_32 (
1930 pix_multiply_1x128 (
1931 pix_multiply_1x128 (
1932 unpack_32_1x128 (s), unpack_32_1x128 (m)),
1933 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1934
1935 w--;
1936 }
1937 }
1938
1939 static void
1940 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1941 pixman_op_t op,
1942 uint32_t * pd,
1943 const uint32_t * ps,
1944 const uint32_t * pm,
1945 int w)
1946 {
1947 uint32_t s, m, d;
1948
1949 __m128i xmm_alpha_lo, xmm_alpha_hi;
1950 __m128i xmm_src_lo, xmm_src_hi;
1951 __m128i xmm_dst_lo, xmm_dst_hi;
1952 __m128i xmm_mask_lo, xmm_mask_hi;
1953
1954 while (w && (uintptr_t)pd & 15)
1955 {
1956 s = *ps++;
1957 m = *pm++;
1958 d = *pd;
1959
1960 *pd++ = pack_1x128_32 (
1961 pix_multiply_1x128 (
1962 unpack_32_1x128 (d),
1963 negate_1x128 (pix_multiply_1x128 (
1964 unpack_32_1x128 (m),
1965 expand_alpha_1x128 (unpack_32_1x128 (s))))));
1966 w--;
1967 }
1968
1969 while (w >= 4)
1970 {
1971 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1972 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1973 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1974
1975 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1976 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1977 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1978
1979 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1980 &xmm_alpha_lo, &xmm_alpha_hi);
1981
1982 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1983 &xmm_alpha_lo, &xmm_alpha_hi,
1984 &xmm_mask_lo, &xmm_mask_hi);
1985
1986 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1987 &xmm_mask_lo, &xmm_mask_hi);
1988
1989 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1990 &xmm_mask_lo, &xmm_mask_hi,
1991 &xmm_dst_lo, &xmm_dst_hi);
1992
1993 save_128_aligned (
1994 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1995
1996 ps += 4;
1997 pd += 4;
1998 pm += 4;
1999 w -= 4;
2000 }
2001
2002 while (w)
2003 {
2004 s = *ps++;
2005 m = *pm++;
2006 d = *pd;
2007
2008 *pd++ = pack_1x128_32 (
2009 pix_multiply_1x128 (
2010 unpack_32_1x128 (d),
2011 negate_1x128 (pix_multiply_1x128 (
2012 unpack_32_1x128 (m),
2013 expand_alpha_1x128 (unpack_32_1x128 (s))))));
2014 w--;
2015 }
2016 }
2017
2018 static force_inline uint32_t
2019 core_combine_atop_ca_pixel_sse2 (uint32_t src,
2020 uint32_t mask,
2021 uint32_t dst)
2022 {
2023 __m128i m = unpack_32_1x128 (mask);
2024 __m128i s = unpack_32_1x128 (src);
2025 __m128i d = unpack_32_1x128 (dst);
2026 __m128i sa = expand_alpha_1x128 (s);
2027 __m128i da = expand_alpha_1x128 (d);
2028
2029 s = pix_multiply_1x128 (s, m);
2030 m = negate_1x128 (pix_multiply_1x128 (m, sa));
2031
2032 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2033 }
2034
2035 static void
2036 sse2_combine_atop_ca (pixman_implementation_t *imp,
2037 pixman_op_t op,
2038 uint32_t * pd,
2039 const uint32_t * ps,
2040 const uint32_t * pm,
2041 int w)
2042 {
2043 uint32_t s, m, d;
2044
2045 __m128i xmm_src_lo, xmm_src_hi;
2046 __m128i xmm_dst_lo, xmm_dst_hi;
2047 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2048 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2049 __m128i xmm_mask_lo, xmm_mask_hi;
2050
2051 while (w && (uintptr_t)pd & 15)
2052 {
2053 s = *ps++;
2054 m = *pm++;
2055 d = *pd;
2056
2057 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2058 w--;
2059 }
2060
2061 while (w >= 4)
2062 {
2063 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2064 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2065 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2066
2067 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2068 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2069 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2070
2071 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2072 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2073 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2074 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2075
2076 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2077 &xmm_mask_lo, &xmm_mask_hi,
2078 &xmm_src_lo, &xmm_src_hi);
2079 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2080 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2081 &xmm_mask_lo, &xmm_mask_hi);
2082
2083 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2084
2085 pix_add_multiply_2x128 (
2086 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2087 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2088 &xmm_dst_lo, &xmm_dst_hi);
2089
2090 save_128_aligned (
2091 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2092
2093 ps += 4;
2094 pd += 4;
2095 pm += 4;
2096 w -= 4;
2097 }
2098
2099 while (w)
2100 {
2101 s = *ps++;
2102 m = *pm++;
2103 d = *pd;
2104
2105 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2106 w--;
2107 }
2108 }
2109
2110 static force_inline uint32_t
2111 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2112 uint32_t mask,
2113 uint32_t dst)
2114 {
2115 __m128i m = unpack_32_1x128 (mask);
2116 __m128i s = unpack_32_1x128 (src);
2117 __m128i d = unpack_32_1x128 (dst);
2118
2119 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2120 __m128i sa = expand_alpha_1x128 (s);
2121
2122 s = pix_multiply_1x128 (s, m);
2123 m = pix_multiply_1x128 (m, sa);
2124
2125 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2126 }
2127
2128 static void
2129 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2130 pixman_op_t op,
2131 uint32_t * pd,
2132 const uint32_t * ps,
2133 const uint32_t * pm,
2134 int w)
2135 {
2136 uint32_t s, m, d;
2137
2138 __m128i xmm_src_lo, xmm_src_hi;
2139 __m128i xmm_dst_lo, xmm_dst_hi;
2140 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2141 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2142 __m128i xmm_mask_lo, xmm_mask_hi;
2143
2144 while (w && (uintptr_t)pd & 15)
2145 {
2146 s = *ps++;
2147 m = *pm++;
2148 d = *pd;
2149
2150 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2151 w--;
2152 }
2153
2154 while (w >= 4)
2155 {
2156 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2157 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2158 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2159
2160 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2161 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2162 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2163
2164 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2165 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2166 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2167 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2168
2169 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2170 &xmm_mask_lo, &xmm_mask_hi,
2171 &xmm_src_lo, &xmm_src_hi);
2172 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2173 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2174 &xmm_mask_lo, &xmm_mask_hi);
2175
2176 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2177 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2178
2179 pix_add_multiply_2x128 (
2180 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2181 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2182 &xmm_dst_lo, &xmm_dst_hi);
2183
2184 save_128_aligned (
2185 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2186
2187 ps += 4;
2188 pd += 4;
2189 pm += 4;
2190 w -= 4;
2191 }
2192
2193 while (w)
2194 {
2195 s = *ps++;
2196 m = *pm++;
2197 d = *pd;
2198
2199 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2200 w--;
2201 }
2202 }
2203
2204 static force_inline uint32_t
2205 core_combine_xor_ca_pixel_sse2 (uint32_t src,
2206 uint32_t mask,
2207 uint32_t dst)
2208 {
2209 __m128i a = unpack_32_1x128 (mask);
2210 __m128i s = unpack_32_1x128 (src);
2211 __m128i d = unpack_32_1x128 (dst);
2212
2213 __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2214 a, expand_alpha_1x128 (s)));
2215 __m128i dest = pix_multiply_1x128 (s, a);
2216 __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2217
2218 return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2219 &alpha_dst,
2220 &dest,
2221 &alpha_src));
2222 }
2223
2224 static void
2225 sse2_combine_xor_ca (pixman_implementation_t *imp,
2226 pixman_op_t op,
2227 uint32_t * pd,
2228 const uint32_t * ps,
2229 const uint32_t * pm,
2230 int w)
2231 {
2232 uint32_t s, m, d;
2233
2234 __m128i xmm_src_lo, xmm_src_hi;
2235 __m128i xmm_dst_lo, xmm_dst_hi;
2236 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2237 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2238 __m128i xmm_mask_lo, xmm_mask_hi;
2239
2240 while (w && (uintptr_t)pd & 15)
2241 {
2242 s = *ps++;
2243 m = *pm++;
2244 d = *pd;
2245
2246 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2247 w--;
2248 }
2249
2250 while (w >= 4)
2251 {
2252 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2253 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2254 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2255
2256 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2257 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2258 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2259
2260 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2261 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2262 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2263 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2264
2265 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2266 &xmm_mask_lo, &xmm_mask_hi,
2267 &xmm_src_lo, &xmm_src_hi);
2268 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2269 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2270 &xmm_mask_lo, &xmm_mask_hi);
2271
2272 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2273 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2274 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2275 &xmm_mask_lo, &xmm_mask_hi);
2276
2277 pix_add_multiply_2x128 (
2278 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2279 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2280 &xmm_dst_lo, &xmm_dst_hi);
2281
2282 save_128_aligned (
2283 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2284
2285 ps += 4;
2286 pd += 4;
2287 pm += 4;
2288 w -= 4;
2289 }
2290
2291 while (w)
2292 {
2293 s = *ps++;
2294 m = *pm++;
2295 d = *pd;
2296
2297 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2298 w--;
2299 }
2300 }
2301
2302 static void
2303 sse2_combine_add_ca (pixman_implementation_t *imp,
2304 pixman_op_t op,
2305 uint32_t * pd,
2306 const uint32_t * ps,
2307 const uint32_t * pm,
2308 int w)
2309 {
2310 uint32_t s, m, d;
2311
2312 __m128i xmm_src_lo, xmm_src_hi;
2313 __m128i xmm_dst_lo, xmm_dst_hi;
2314 __m128i xmm_mask_lo, xmm_mask_hi;
2315
2316 while (w && (uintptr_t)pd & 15)
2317 {
2318 s = *ps++;
2319 m = *pm++;
2320 d = *pd;
2321
2322 *pd++ = pack_1x128_32 (
2323 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2324 unpack_32_1x128 (m)),
2325 unpack_32_1x128 (d)));
2326 w--;
2327 }
2328
2329 while (w >= 4)
2330 {
2331 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2332 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2333 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2334
2335 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2336 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2337 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2338
2339 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2340 &xmm_mask_lo, &xmm_mask_hi,
2341 &xmm_src_lo, &xmm_src_hi);
2342
2343 save_128_aligned (
2344 (__m128i*)pd, pack_2x128_128 (
2345 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2346 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2347
2348 ps += 4;
2349 pd += 4;
2350 pm += 4;
2351 w -= 4;
2352 }
2353
2354 while (w)
2355 {
2356 s = *ps++;
2357 m = *pm++;
2358 d = *pd;
2359
2360 *pd++ = pack_1x128_32 (
2361 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2362 unpack_32_1x128 (m)),
2363 unpack_32_1x128 (d)));
2364 w--;
2365 }
2366 }
2367
2368 static force_inline __m128i
2369 create_mask_16_128 (uint16_t mask)
2370 {
2371 return _mm_set1_epi16 (mask);
2372 }
2373
2374 /* Work around a code generation bug in Sun Studio 12. */
2375 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2376 # define create_mask_2x32_128(mask0, mask1) \
2377 (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2378 #else
2379 static force_inline __m128i
2380 create_mask_2x32_128 (uint32_t mask0,
2381 uint32_t mask1)
2382 {
2383 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2384 }
2385 #endif
2386
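/* OVER of a solid color onto a 32 bpp (8888) destination. */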
2387 static void
2388 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2389 pixman_composite_info_t *info)
2390 {
2391 PIXMAN_COMPOSITE_ARGS (info);
2392 uint32_t src;
2393 uint32_t *dst_line, *dst, d;
2394 int32_t w;
2395 int dst_stride;
2396 __m128i xmm_src, xmm_alpha;
2397 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2398
2399 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2400
2401 if (src == 0)
2402 return;
2403
2404 PIXMAN_IMAGE_GET_LINE (
2405 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2406
2407 xmm_src = expand_pixel_32_1x128 (src);
2408 xmm_alpha = expand_alpha_1x128 (xmm_src);
2409
2410 while (height--)
2411 {
2412 dst = dst_line;
2413
2414 dst_line += dst_stride;
2415 w = width;
2416
2417 while (w && (uintptr_t)dst & 15)
2418 {
2419 d = *dst;
2420 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2421 xmm_alpha,
2422 unpack_32_1x128 (d)));
2423 w--;
2424 }
2425
2426 while (w >= 4)
2427 {
2428 xmm_dst = load_128_aligned ((__m128i*)dst);
2429
2430 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2431
2432 over_2x128 (&xmm_src, &xmm_src,
2433 &xmm_alpha, &xmm_alpha,
2434 &xmm_dst_lo, &xmm_dst_hi);
2435
2436 /* rebuild the 4 pixel data and save */
2437 save_128_aligned (
2438 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2439
2440 w -= 4;
2441 dst += 4;
2442 }
2443
2444 while (w)
2445 {
2446 d = *dst;
2447 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2448 xmm_alpha,
2449 unpack_32_1x128 (d)));
2450 w--;
2451 }
2452
2453 }
2454 }
2455
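/* OVER of a solid color onto a 16 bpp (0565) destination. */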
2456 static void
2457 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2458 pixman_composite_info_t *info)
2459 {
2460 PIXMAN_COMPOSITE_ARGS (info);
2461 uint32_t src;
2462 uint16_t *dst_line, *dst, d;
2463 int32_t w;
2464 int dst_stride;
2465 __m128i xmm_src, xmm_alpha;
2466 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2467
2468 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2469
2470 if (src == 0)
2471 return;
2472
2473 PIXMAN_IMAGE_GET_LINE (
2474 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2475
2476 xmm_src = expand_pixel_32_1x128 (src);
2477 xmm_alpha = expand_alpha_1x128 (xmm_src);
2478
2479 while (height--)
2480 {
2481 dst = dst_line;
2482
2483 dst_line += dst_stride;
2484 w = width;
2485
2486 while (w && (uintptr_t)dst & 15)
2487 {
2488 d = *dst;
2489
2490 *dst++ = pack_565_32_16 (
2491 pack_1x128_32 (over_1x128 (xmm_src,
2492 xmm_alpha,
2493 expand565_16_1x128 (d))));
2494 w--;
2495 }
2496
2497 while (w >= 8)
2498 {
2499 xmm_dst = load_128_aligned ((__m128i*)dst);
2500
2501 unpack_565_128_4x128 (xmm_dst,
2502 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2503
2504 over_2x128 (&xmm_src, &xmm_src,
2505 &xmm_alpha, &xmm_alpha,
2506 &xmm_dst0, &xmm_dst1);
2507 over_2x128 (&xmm_src, &xmm_src,
2508 &xmm_alpha, &xmm_alpha,
2509 &xmm_dst2, &xmm_dst3);
2510
2511 xmm_dst = pack_565_4x128_128 (
2512 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2513
2514 save_128_aligned ((__m128i*)dst, xmm_dst);
2515
2516 dst += 8;
2517 w -= 8;
2518 }
2519
2520 while (w--)
2521 {
2522 d = *dst;
2523 *dst++ = pack_565_32_16 (
2524 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2525 expand565_16_1x128 (d))));
2526 }
2527 }
2528
2529 }
2530
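/* ADD of a solid color through a component-alpha 8888 mask onto an 8888 destination. */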
2531 static void
2532 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2533 pixman_composite_info_t *info)
2534 {
2535 PIXMAN_COMPOSITE_ARGS (info);
2536 uint32_t src;
2537 uint32_t *dst_line, d;
2538 uint32_t *mask_line, m;
2539 uint32_t pack_cmp;
2540 int dst_stride, mask_stride;
2541
2542 __m128i xmm_src;
2543 __m128i xmm_dst;
2544 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2545
2546 __m128i mmx_src, mmx_mask, mmx_dest;
2547
2548 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2549
2550 if (src == 0)
2551 return;
2552
2553 PIXMAN_IMAGE_GET_LINE (
2554 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2555 PIXMAN_IMAGE_GET_LINE (
2556 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2557
2558 xmm_src = _mm_unpacklo_epi8 (
2559 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2560 mmx_src = xmm_src;
2561
2562 while (height--)
2563 {
2564 int w = width;
2565 const uint32_t *pm = (uint32_t *)mask_line;
2566 uint32_t *pd = (uint32_t *)dst_line;
2567
2568 dst_line += dst_stride;
2569 mask_line += mask_stride;
2570
2571 while (w && (uintptr_t)pd & 15)
2572 {
2573 m = *pm++;
2574
2575 if (m)
2576 {
2577 d = *pd;
2578
2579 mmx_mask = unpack_32_1x128 (m);
2580 mmx_dest = unpack_32_1x128 (d);
2581
2582 *pd = pack_1x128_32 (
2583 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2584 mmx_dest));
2585 }
2586
2587 pd++;
2588 w--;
2589 }
2590
2591 while (w >= 4)
2592 {
2593 xmm_mask = load_128_unaligned ((__m128i*)pm);
2594
2595 pack_cmp =
2596 _mm_movemask_epi8 (
2597 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2598
2599 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2600 if (pack_cmp != 0xffff)
2601 {
2602 xmm_dst = load_128_aligned ((__m128i*)pd);
2603
2604 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2605
2606 pix_multiply_2x128 (&xmm_src, &xmm_src,
2607 &xmm_mask_lo, &xmm_mask_hi,
2608 &xmm_mask_lo, &xmm_mask_hi);
2609 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2610
2611 save_128_aligned (
2612 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2613 }
2614
2615 pd += 4;
2616 pm += 4;
2617 w -= 4;
2618 }
2619
2620 while (w)
2621 {
2622 m = *pm++;
2623
2624 if (m)
2625 {
2626 d = *pd;
2627
2628 mmx_mask = unpack_32_1x128 (m);
2629 mmx_dest = unpack_32_1x128 (d);
2630
2631 *pd = pack_1x128_32 (
2632 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2633 mmx_dest));
2634 }
2635
2636 pd++;
2637 w--;
2638 }
2639 }
2640
2641 }
2642
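/* OVER of a solid color through a component-alpha 8888 mask onto an 8888 destination. */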
2643 static void
2644 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2645 pixman_composite_info_t *info)
2646 {
2647 PIXMAN_COMPOSITE_ARGS (info);
2648 uint32_t src;
2649 uint32_t *dst_line, d;
2650 uint32_t *mask_line, m;
2651 uint32_t pack_cmp;
2652 int dst_stride, mask_stride;
2653
2654 __m128i xmm_src, xmm_alpha;
2655 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2656 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2657
2658 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2659
2660 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2661
2662 if (src == 0)
2663 return;
2664
2665 PIXMAN_IMAGE_GET_LINE (
2666 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2667 PIXMAN_IMAGE_GET_LINE (
2668 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2669
2670 xmm_src = _mm_unpacklo_epi8 (
2671 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2672 xmm_alpha = expand_alpha_1x128 (xmm_src);
2673 mmx_src = xmm_src;
2674 mmx_alpha = xmm_alpha;
2675
2676 while (height--)
2677 {
2678 int w = width;
2679 const uint32_t *pm = (uint32_t *)mask_line;
2680 uint32_t *pd = (uint32_t *)dst_line;
2681
2682 dst_line += dst_stride;
2683 mask_line += mask_stride;
2684
2685 while (w && (uintptr_t)pd & 15)
2686 {
2687 m = *pm++;
2688
2689 if (m)
2690 {
2691 d = *pd;
2692 mmx_mask = unpack_32_1x128 (m);
2693 mmx_dest = unpack_32_1x128 (d);
2694
2695 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2696 &mmx_alpha,
2697 &mmx_mask,
2698 &mmx_dest));
2699 }
2700
2701 pd++;
2702 w--;
2703 }
2704
2705 while (w >= 4)
2706 {
2707 xmm_mask = load_128_unaligned ((__m128i*)pm);
2708
2709 pack_cmp =
2710 _mm_movemask_epi8 (
2711 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2712
2713 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2714 if (pack_cmp != 0xffff)
2715 {
2716 xmm_dst = load_128_aligned ((__m128i*)pd);
2717
2718 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2719 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2720
2721 in_over_2x128 (&xmm_src, &xmm_src,
2722 &xmm_alpha, &xmm_alpha,
2723 &xmm_mask_lo, &xmm_mask_hi,
2724 &xmm_dst_lo, &xmm_dst_hi);
2725
2726 save_128_aligned (
2727 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2728 }
2729
2730 pd += 4;
2731 pm += 4;
2732 w -= 4;
2733 }
2734
2735 while (w)
2736 {
2737 m = *pm++;
2738
2739 if (m)
2740 {
2741 d = *pd;
2742 mmx_mask = unpack_32_1x128 (m);
2743 mmx_dest = unpack_32_1x128 (d);
2744
2745 *pd = pack_1x128_32 (
2746 in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2747 }
2748
2749 pd++;
2750 w--;
2751 }
2752 }
2753
2754 }
2755
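/* OVER of an 8888 source, scaled by the alpha of a solid mask, onto an 8888 destination. */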
2756 static void
2757 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2758 pixman_composite_info_t *info)
2759 {
2760 PIXMAN_COMPOSITE_ARGS (info);
2761 uint32_t *dst_line, *dst;
2762 uint32_t *src_line, *src;
2763 uint32_t mask;
2764 int32_t w;
2765 int dst_stride, src_stride;
2766
2767 __m128i xmm_mask;
2768 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2769 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2770 __m128i xmm_alpha_lo, xmm_alpha_hi;
2771
2772 PIXMAN_IMAGE_GET_LINE (
2773 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2774 PIXMAN_IMAGE_GET_LINE (
2775 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2776
2777 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2778
2779 xmm_mask = create_mask_16_128 (mask >> 24);
2780
2781 while (height--)
2782 {
2783 dst = dst_line;
2784 dst_line += dst_stride;
2785 src = src_line;
2786 src_line += src_stride;
2787 w = width;
2788
2789 while (w && (uintptr_t)dst & 15)
2790 {
2791 uint32_t s = *src++;
2792
2793 if (s)
2794 {
2795 uint32_t d = *dst;
2796
2797 __m128i ms = unpack_32_1x128 (s);
2798 __m128i alpha = expand_alpha_1x128 (ms);
2799 __m128i mask = xmm_mask;
2800 __m128i dest = unpack_32_1x128 (d);
2801
2802 *dst = pack_1x128_32 (
2803 in_over_1x128 (&ms, &alpha, &mask, &dest));
2804 }
2805 dst++;
2806 w--;
2807 }
2808
2809 while (w >= 4)
2810 {
2811 xmm_src = load_128_unaligned ((__m128i*)src);
2812
2813 if (!is_zero (xmm_src))
2814 {
2815 xmm_dst = load_128_aligned ((__m128i*)dst);
2816
2817 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2818 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2819 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2820 &xmm_alpha_lo, &xmm_alpha_hi);
2821
2822 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2823 &xmm_alpha_lo, &xmm_alpha_hi,
2824 &xmm_mask, &xmm_mask,
2825 &xmm_dst_lo, &xmm_dst_hi);
2826
2827 save_128_aligned (
2828 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2829 }
2830
2831 dst += 4;
2832 src += 4;
2833 w -= 4;
2834 }
2835
2836 while (w)
2837 {
2838 uint32_t s = *src++;
2839
2840 if (s)
2841 {
2842 uint32_t d = *dst;
2843
2844 __m128i ms = unpack_32_1x128 (s);
2845 __m128i alpha = expand_alpha_1x128 (ms);
2846 __m128i mask = xmm_mask;
2847 __m128i dest = unpack_32_1x128 (d);
2848
2849 *dst = pack_1x128_32 (
2850 in_over_1x128 (&ms, &alpha, &mask, &dest));
2851 }
2852
2853 dst++;
2854 w--;
2855 }
2856 }
2857
2858 }
2859
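/* SRC conversion of an x888 source to a 0565 destination. */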
2860 static void
2861 sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
2862 pixman_composite_info_t *info)
2863 {
2864 PIXMAN_COMPOSITE_ARGS (info);
2865 uint16_t *dst_line, *dst;
2866 uint32_t *src_line, *src, s;
2867 int dst_stride, src_stride;
2868 int32_t w;
2869
2870 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2871 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2872
2873 while (height--)
2874 {
2875 dst = dst_line;
2876 dst_line += dst_stride;
2877 src = src_line;
2878 src_line += src_stride;
2879 w = width;
2880
2881 while (w && (uintptr_t)dst & 15)
2882 {
2883 s = *src++;
2884 *dst = convert_8888_to_0565 (s);
2885 dst++;
2886 w--;
2887 }
2888
2889 while (w >= 8)
2890 {
2891 __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
2892 __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
2893
2894 save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
2895
2896 w -= 8;
2897 src += 8;
2898 dst += 8;
2899 }
2900
2901 while (w)
2902 {
2903 s = *src++;
2904 *dst = convert_8888_to_0565 (s);
2905 dst++;
2906 w--;
2907 }
2908 }
2909 }
2910
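/* SRC copy of an x888 source to an 8888 destination, forcing the alpha byte to 0xff. */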
2911 static void
2912 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2913 pixman_composite_info_t *info)
2914 {
2915 PIXMAN_COMPOSITE_ARGS (info);
2916 uint32_t *dst_line, *dst;
2917 uint32_t *src_line, *src;
2918 int32_t w;
2919 int dst_stride, src_stride;
2920
2921
2922 PIXMAN_IMAGE_GET_LINE (
2923 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2924 PIXMAN_IMAGE_GET_LINE (
2925 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2926
2927 while (height--)
2928 {
2929 dst = dst_line;
2930 dst_line += dst_stride;
2931 src = src_line;
2932 src_line += src_stride;
2933 w = width;
2934
2935 while (w && (uintptr_t)dst & 15)
2936 {
2937 *dst++ = *src++ | 0xff000000;
2938 w--;
2939 }
2940
2941 while (w >= 16)
2942 {
2943 __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2944
2945 xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2946 xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2947 xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2948 xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2949
2950 save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2951 save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2952 save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2953 save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2954
2955 dst += 16;
2956 src += 16;
2957 w -= 16;
2958 }
2959
2960 while (w)
2961 {
2962 *dst++ = *src++ | 0xff000000;
2963 w--;
2964 }
2965 }
2966
2967 }
2968
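/* OVER of an x888 source (treated as opaque), scaled by the alpha of a solid mask, onto an 8888 destination. */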
2969 static void
2970 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2971 pixman_composite_info_t *info)
2972 {
2973 PIXMAN_COMPOSITE_ARGS (info);
2974 uint32_t *dst_line, *dst;
2975 uint32_t *src_line, *src;
2976 uint32_t mask;
2977 int dst_stride, src_stride;
2978 int32_t w;
2979
2980 __m128i xmm_mask, xmm_alpha;
2981 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2982 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2983
2984 PIXMAN_IMAGE_GET_LINE (
2985 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2986 PIXMAN_IMAGE_GET_LINE (
2987 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2988
2989 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2990
2991 xmm_mask = create_mask_16_128 (mask >> 24);
2992 xmm_alpha = mask_00ff;
2993
2994 while (height--)
2995 {
2996 dst = dst_line;
2997 dst_line += dst_stride;
2998 src = src_line;
2999 src_line += src_stride;
3000 w = width;
3001
3002 while (w && (uintptr_t)dst & 15)
3003 {
3004 uint32_t s = (*src++) | 0xff000000;
3005 uint32_t d = *dst;
3006
3007 __m128i src = unpack_32_1x128 (s);
3008 __m128i alpha = xmm_alpha;
3009 __m128i mask = xmm_mask;
3010 __m128i dest = unpack_32_1x128 (d);
3011
3012 *dst++ = pack_1x128_32 (
3013 in_over_1x128 (&src, &alpha, &mask, &dest));
3014
3015 w--;
3016 }
3017
3018 while (w >= 4)
3019 {
3020 xmm_src = _mm_or_si128 (
3021 load_128_unaligned ((__m128i*)src), mask_ff000000);
3022 xmm_dst = load_128_aligned ((__m128i*)dst);
3023
3024 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3025 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3026
3027 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3028 &xmm_alpha, &xmm_alpha,
3029 &xmm_mask, &xmm_mask,
3030 &xmm_dst_lo, &xmm_dst_hi);
3031
3032 save_128_aligned (
3033 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3034
3035 dst += 4;
3036 src += 4;
3037 w -= 4;
3038
3039 }
3040
3041 while (w)
3042 {
3043 uint32_t s = (*src++) | 0xff000000;
3044 uint32_t d = *dst;
3045
3046 __m128i src = unpack_32_1x128 (s);
3047 __m128i alpha = xmm_alpha;
3048 __m128i mask = xmm_mask;
3049 __m128i dest = unpack_32_1x128 (d);
3050
3051 *dst++ = pack_1x128_32 (
3052 in_over_1x128 (&src, &alpha, &mask, &dest));
3053
3054 w--;
3055 }
3056 }
3057
3058 }
3059
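/* OVER of an 8888 source onto an 8888 destination, one scanline at a time. */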
3060 static void
3061 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3062 pixman_composite_info_t *info)
3063 {
3064 PIXMAN_COMPOSITE_ARGS (info);
3065 int dst_stride, src_stride;
3066 uint32_t *dst_line, *dst;
3067 uint32_t *src_line, *src;
3068
3069 PIXMAN_IMAGE_GET_LINE (
3070 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3071 PIXMAN_IMAGE_GET_LINE (
3072 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3073
3074 dst = dst_line;
3075 src = src_line;
3076
3077 while (height--)
3078 {
3079 sse2_combine_over_u (imp, op, dst, src, NULL, width);
3080
3081 dst += dst_stride;
3082 src += src_stride;
3083 }
3084 }
3085
3086 static force_inline uint16_t
3087 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3088 {
3089 __m128i ms;
3090
3091 ms = unpack_32_1x128 (src);
3092 return pack_565_32_16 (
3093 pack_1x128_32 (
3094 over_1x128 (
3095 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3096 }
3097
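/* OVER of an 8888 source onto a 0565 destination. */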
3098 static void
3099 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3100 pixman_composite_info_t *info)
3101 {
3102 PIXMAN_COMPOSITE_ARGS (info);
3103 uint16_t *dst_line, *dst, d;
3104 uint32_t *src_line, *src, s;
3105 int dst_stride, src_stride;
3106 int32_t w;
3107
3108 __m128i xmm_alpha_lo, xmm_alpha_hi;
3109 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3110 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3111
3112 PIXMAN_IMAGE_GET_LINE (
3113 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3114 PIXMAN_IMAGE_GET_LINE (
3115 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3116
3117 while (height--)
3118 {
3119 dst = dst_line;
3120 src = src_line;
3121
3122 dst_line += dst_stride;
3123 src_line += src_stride;
3124 w = width;
3125
3126 /* Align dst on a 16-byte boundary */
3127 while (w &&
3128 ((uintptr_t)dst & 15))
3129 {
3130 s = *src++;
3131 d = *dst;
3132
3133 *dst++ = composite_over_8888_0565pixel (s, d);
3134 w--;
3135 }
3136
3137 /* It's an 8-pixel loop */
3138 while (w >= 8)
3139 {
3140 /* Load the source unaligned, since its address
3141 * is not guaranteed to be 16-byte aligned.
3142 */
3143 xmm_src = load_128_unaligned ((__m128i*) src);
3144 xmm_dst = load_128_aligned ((__m128i*) dst);
3145
3146 /* Unpacking */
3147 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3148 unpack_565_128_4x128 (xmm_dst,
3149 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3150 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3151 &xmm_alpha_lo, &xmm_alpha_hi);
3152
3153 /* Load the next 4 pixels from memory ahead of
3154 * time to optimize the memory read.
3155 */
3156 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3157
3158 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3159 &xmm_alpha_lo, &xmm_alpha_hi,
3160 &xmm_dst0, &xmm_dst1);
3161
3162 /* Unpacking */
3163 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3164 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3165 &xmm_alpha_lo, &xmm_alpha_hi);
3166
3167 over_2x128 (&xmm_src_lo, &xmm_src_hi,
3168 &xmm_alpha_lo, &xmm_alpha_hi,
3169 &xmm_dst2, &xmm_dst3);
3170
3171 save_128_aligned (
3172 (__m128i*)dst, pack_565_4x128_128 (
3173 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3174
3175 w -= 8;
3176 dst += 8;
3177 src += 8;
3178 }
3179
3180 while (w--)
3181 {
3182 s = *src++;
3183 d = *dst;
3184
3185 *dst++ = composite_over_8888_0565pixel (s, d);
3186 }
3187 }
3188
3189 }
3190
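/* OVER of a solid color through an a8 mask onto an 8888 destination. */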
3191 static void
3192 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3193 pixman_composite_info_t *info)
3194 {
3195 PIXMAN_COMPOSITE_ARGS (info);
3196 uint32_t src, srca;
3197 uint32_t *dst_line, *dst;
3198 uint8_t *mask_line, *mask;
3199 int dst_stride, mask_stride;
3200 int32_t w;
3201 uint32_t m, d;
3202
3203 __m128i xmm_src, xmm_alpha, xmm_def;
3204 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3205 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3206
3207 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3208
3209 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3210
3211 srca = src >> 24;
3212 if (src == 0)
3213 return;
3214
3215 PIXMAN_IMAGE_GET_LINE (
3216 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3217 PIXMAN_IMAGE_GET_LINE (
3218 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3219
3220 xmm_def = create_mask_2x32_128 (src, src);
3221 xmm_src = expand_pixel_32_1x128 (src);
3222 xmm_alpha = expand_alpha_1x128 (xmm_src);
3223 mmx_src = xmm_src;
3224 mmx_alpha = xmm_alpha;
3225
3226 while (height--)
3227 {
3228 dst = dst_line;
3229 dst_line += dst_stride;
3230 mask = mask_line;
3231 mask_line += mask_stride;
3232 w = width;
3233
3234 while (w && (uintptr_t)dst & 15)
3235 {
3236 uint8_t m = *mask++;
3237
3238 if (m)
3239 {
3240 d = *dst;
3241 mmx_mask = expand_pixel_8_1x128 (m);
3242 mmx_dest = unpack_32_1x128 (d);
3243
3244 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3245 &mmx_alpha,
3246 &mmx_mask,
3247 &mmx_dest));
3248 }
3249
3250 w--;
3251 dst++;
3252 }
3253
3254 while (w >= 4)
3255 {
3256 m = *((uint32_t*)mask);
3257
3258 if (srca == 0xff && m == 0xffffffff)
3259 {
3260 save_128_aligned ((__m128i*)dst, xmm_def);
3261 }
3262 else if (m)
3263 {
3264 xmm_dst = load_128_aligned ((__m128i*) dst);
3265 xmm_mask = unpack_32_1x128 (m);
3266 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3267
3268 /* Unpacking */
3269 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3270 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3271
3272 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3273 &xmm_mask_lo, &xmm_mask_hi);
3274
3275 in_over_2x128 (&xmm_src, &xmm_src,
3276 &xmm_alpha, &xmm_alpha,
3277 &xmm_mask_lo, &xmm_mask_hi,
3278 &xmm_dst_lo, &xmm_dst_hi);
3279
3280 save_128_aligned (
3281 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3282 }
3283
3284 w -= 4;
3285 dst += 4;
3286 mask += 4;
3287 }
3288
3289 while (w)
3290 {
3291 uint8_t m = *mask++;
3292
3293 if (m)
3294 {
3295 d = *dst;
3296 mmx_mask = expand_pixel_8_1x128 (m);
3297 mmx_dest = unpack_32_1x128 (d);
3298
3299 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3300 &mmx_alpha,
3301 &mmx_mask,
3302 &mmx_dest));
3303 }
3304
3305 w--;
3306 dst++;
3307 }
3308 }
3309
3310 }
3311
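/* Fill a rectangle with a solid value; supports 8, 16 and 32 bpp destinations. */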
3312 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
3313 __attribute__((__force_align_arg_pointer__))
3314 #endif
3315 static pixman_bool_t
3316 sse2_fill (pixman_implementation_t *imp,
3317 uint32_t * bits,
3318 int stride,
3319 int bpp,
3320 int x,
3321 int y,
3322 int width,
3323 int height,
3324 uint32_t filler)
3325 {
3326 uint32_t byte_width;
3327 uint8_t *byte_line;
3328
3329 __m128i xmm_def;
3330
3331 if (bpp == 8)
3332 {
3333 uint8_t b;
3334 uint16_t w;
3335
3336 stride = stride * (int) sizeof (uint32_t) / 1;
3337 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3338 byte_width = width;
3339 stride *= 1;
3340
3341 b = filler & 0xff;
3342 w = (b << 8) | b;
3343 filler = (w << 16) | w;
3344 }
3345 else if (bpp == 16)
3346 {
3347 stride = stride * (int) sizeof (uint32_t) / 2;
3348 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3349 byte_width = 2 * width;
3350 stride *= 2;
3351
3352 filler = (filler & 0xffff) * 0x00010001;
3353 }
3354 else if (bpp == 32)
3355 {
3356 stride = stride * (int) sizeof (uint32_t) / 4;
3357 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3358 byte_width = 4 * width;
3359 stride *= 4;
3360 }
3361 else
3362 {
3363 return FALSE;
3364 }
3365
3366 xmm_def = create_mask_2x32_128 (filler, filler);
3367
3368 while (height--)
3369 {
3370 int w;
3371 uint8_t *d = byte_line;
3372 byte_line += stride;
3373 w = byte_width;
3374
3375 if (w >= 1 && ((uintptr_t)d & 1))
3376 {
3377 *(uint8_t *)d = filler;
3378 w -= 1;
3379 d += 1;
3380 }
3381
3382 while (w >= 2 && ((uintptr_t)d & 3))
3383 {
3384 *(uint16_t *)d = filler;
3385 w -= 2;
3386 d += 2;
3387 }
3388
3389 while (w >= 4 && ((uintptr_t)d & 15))
3390 {
3391 *(uint32_t *)d = filler;
3392
3393 w -= 4;
3394 d += 4;
3395 }
3396
3397 while (w >= 128)
3398 {
3399 save_128_aligned ((__m128i*)(d), xmm_def);
3400 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3401 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3402 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3403 save_128_aligned ((__m128i*)(d + 64), xmm_def);
3404 save_128_aligned ((__m128i*)(d + 80), xmm_def);
3405 save_128_aligned ((__m128i*)(d + 96), xmm_def);
3406 save_128_aligned ((__m128i*)(d + 112), xmm_def);
3407
3408 d += 128;
3409 w -= 128;
3410 }
3411
3412 if (w >= 64)
3413 {
3414 save_128_aligned ((__m128i*)(d), xmm_def);
3415 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3416 save_128_aligned ((__m128i*)(d + 32), xmm_def);
3417 save_128_aligned ((__m128i*)(d + 48), xmm_def);
3418
3419 d += 64;
3420 w -= 64;
3421 }
3422
3423 if (w >= 32)
3424 {
3425 save_128_aligned ((__m128i*)(d), xmm_def);
3426 save_128_aligned ((__m128i*)(d + 16), xmm_def);
3427
3428 d += 32;
3429 w -= 32;
3430 }
3431
3432 if (w >= 16)
3433 {
3434 save_128_aligned ((__m128i*)(d), xmm_def);
3435
3436 d += 16;
3437 w -= 16;
3438 }
3439
3440 while (w >= 4)
3441 {
3442 *(uint32_t *)d = filler;
3443
3444 w -= 4;
3445 d += 4;
3446 }
3447
3448 if (w >= 2)
3449 {
3450 *(uint16_t *)d = filler;
3451 w -= 2;
3452 d += 2;
3453 }
3454
3455 if (w >= 1)
3456 {
3457 *(uint8_t *)d = filler;
3458 w -= 1;
3459 d += 1;
3460 }
3461 }
3462
3463 return TRUE;
3464 }
3465
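/* SRC of a solid color through an a8 mask onto an 8888 destination. */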
3466 static void
3467 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3468 pixman_composite_info_t *info)
3469 {
3470 PIXMAN_COMPOSITE_ARGS (info);
3471 uint32_t src, srca;
3472 uint32_t *dst_line, *dst;
3473 uint8_t *mask_line, *mask;
3474 int dst_stride, mask_stride;
3475 int32_t w;
3476 uint32_t m;
3477
3478 __m128i xmm_src, xmm_def;
3479 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3480
3481 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3482
3483 srca = src >> 24;
3484 if (src == 0)
3485 {
3486 sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
3487 PIXMAN_FORMAT_BPP (dest_image->bits.format),
3488 dest_x, dest_y, width, height, 0);
3489 return;
3490 }
3491
3492 PIXMAN_IMAGE_GET_LINE (
3493 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3494 PIXMAN_IMAGE_GET_LINE (
3495 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3496
3497 xmm_def = create_mask_2x32_128 (src, src);
3498 xmm_src = expand_pixel_32_1x128 (src);
3499
3500 while (height--)
3501 {
3502 dst = dst_line;
3503 dst_line += dst_stride;
3504 mask = mask_line;
3505 mask_line += mask_stride;
3506 w = width;
3507
3508 while (w && (uintptr_t)dst & 15)
3509 {
3510 uint8_t m = *mask++;
3511
3512 if (m)
3513 {
3514 *dst = pack_1x128_32 (
3515 pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3516 }
3517 else
3518 {
3519 *dst = 0;
3520 }
3521
3522 w--;
3523 dst++;
3524 }
3525
3526 while (w >= 4)
3527 {
3528 m = *((uint32_t*)mask);
3529
3530 if (srca == 0xff && m == 0xffffffff)
3531 {
3532 save_128_aligned ((__m128i*)dst, xmm_def);
3533 }
3534 else if (m)
3535 {
3536 xmm_mask = unpack_32_1x128 (m);
3537 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3538
3539 /* Unpacking */
3540 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3541
3542 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3543 &xmm_mask_lo, &xmm_mask_hi);
3544
3545 pix_multiply_2x128 (&xmm_src, &xmm_src,
3546 &xmm_mask_lo, &xmm_mask_hi,
3547 &xmm_mask_lo, &xmm_mask_hi);
3548
3549 save_128_aligned (
3550 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3551 }
3552 else
3553 {
3554 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3555 }
3556
3557 w -= 4;
3558 dst += 4;
3559 mask += 4;
3560 }
3561
3562 while (w)
3563 {
3564 uint8_t m = *mask++;
3565
3566 if (m)
3567 {
3568 *dst = pack_1x128_32 (
3569 pix_multiply_1x128 (
3570 xmm_src, expand_pixel_8_1x128 (m)));
3571 }
3572 else
3573 {
3574 *dst = 0;
3575 }
3576
3577 w--;
3578 dst++;
3579 }
3580 }
3581
3582 }
3583
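/* OVER of a solid color through an a8 mask onto a 0565 destination. */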
3584 static void
3585 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3586 pixman_composite_info_t *info)
3587 {
3588 PIXMAN_COMPOSITE_ARGS (info);
3589 uint32_t src;
3590 uint16_t *dst_line, *dst, d;
3591 uint8_t *mask_line, *mask;
3592 int dst_stride, mask_stride;
3593 int32_t w;
3594 uint32_t m;
3595 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3596
3597 __m128i xmm_src, xmm_alpha;
3598 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3599 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3600
3601 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3602
3603 if (src == 0)
3604 return;
3605
3606 PIXMAN_IMAGE_GET_LINE (
3607 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3608 PIXMAN_IMAGE_GET_LINE (
3609 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3610
3611 xmm_src = expand_pixel_32_1x128 (src);
3612 xmm_alpha = expand_alpha_1x128 (xmm_src);
3613 mmx_src = xmm_src;
3614 mmx_alpha = xmm_alpha;
3615
3616 while (height--)
3617 {
3618 dst = dst_line;
3619 dst_line += dst_stride;
3620 mask = mask_line;
3621 mask_line += mask_stride;
3622 w = width;
3623
3624 while (w && (uintptr_t)dst & 15)
3625 {
3626 m = *mask++;
3627
3628 if (m)
3629 {
3630 d = *dst;
3631 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3632 mmx_dest = expand565_16_1x128 (d);
3633
3634 *dst = pack_565_32_16 (
3635 pack_1x128_32 (
3636 in_over_1x128 (
3637 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3638 }
3639
3640 w--;
3641 dst++;
3642 }
3643
3644 while (w >= 8)
3645 {
3646 xmm_dst = load_128_aligned ((__m128i*) dst);
3647 unpack_565_128_4x128 (xmm_dst,
3648 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3649
3650 m = *((uint32_t*)mask);
3651 mask += 4;
3652
3653 if (m)
3654 {
3655 xmm_mask = unpack_32_1x128 (m);
3656 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3657
3658 /* Unpacking */
3659 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3660
3661 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3662 &xmm_mask_lo, &xmm_mask_hi);
3663
3664 in_over_2x128 (&xmm_src, &xmm_src,
3665 &xmm_alpha, &xmm_alpha,
3666 &xmm_mask_lo, &xmm_mask_hi,
3667 &xmm_dst0, &xmm_dst1);
3668 }
3669
3670 m = *((uint32_t*)mask);
3671 mask += 4;
3672
3673 if (m)
3674 {
3675 xmm_mask = unpack_32_1x128 (m);
3676 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3677
3678 /* Unpacking */
3679 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3680
3681 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3682 &xmm_mask_lo, &xmm_mask_hi);
3683 in_over_2x128 (&xmm_src, &xmm_src,
3684 &xmm_alpha, &xmm_alpha,
3685 &xmm_mask_lo, &xmm_mask_hi,
3686 &xmm_dst2, &xmm_dst3);
3687 }
3688
3689 save_128_aligned (
3690 (__m128i*)dst, pack_565_4x128_128 (
3691 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3692
3693 w -= 8;
3694 dst += 8;
3695 }
3696
3697 while (w)
3698 {
3699 m = *mask++;
3700
3701 if (m)
3702 {
3703 d = *dst;
3704 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3705 mmx_dest = expand565_16_1x128 (d);
3706
3707 *dst = pack_565_32_16 (
3708 pack_1x128_32 (
3709 in_over_1x128 (
3710 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3711 }
3712
3713 w--;
3714 dst++;
3715 }
3716 }
3717
3718 }
3719
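/* OVER of a non-premultiplied (pixbuf) source onto a 0565 destination. */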
3720 static void
3721 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3722 pixman_composite_info_t *info)
3723 {
3724 PIXMAN_COMPOSITE_ARGS (info);
3725 uint16_t *dst_line, *dst, d;
3726 uint32_t *src_line, *src, s;
3727 int dst_stride, src_stride;
3728 int32_t w;
3729 uint32_t opaque, zero;
3730
3731 __m128i ms;
3732 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3733 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3734
3735 PIXMAN_IMAGE_GET_LINE (
3736 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3737 PIXMAN_IMAGE_GET_LINE (
3738 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3739
3740 while (height--)
3741 {
3742 dst = dst_line;
3743 dst_line += dst_stride;
3744 src = src_line;
3745 src_line += src_stride;
3746 w = width;
3747
3748 while (w && (uintptr_t)dst & 15)
3749 {
3750 s = *src++;
3751 d = *dst;
3752
3753 ms = unpack_32_1x128 (s);
3754
3755 *dst++ = pack_565_32_16 (
3756 pack_1x128_32 (
3757 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3758 w--;
3759 }
3760
3761 while (w >= 8)
3762 {
3763 /* First round */
3764 xmm_src = load_128_unaligned ((__m128i*)src);
3765 xmm_dst = load_128_aligned ((__m128i*)dst);
3766
3767 opaque = is_opaque (xmm_src);
3768 zero = is_zero (xmm_src);
3769
3770 unpack_565_128_4x128 (xmm_dst,
3771 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3772 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3773
3774 /* preload next round */
3775 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3776
3777 if (opaque)
3778 {
3779 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3780 &xmm_dst0, &xmm_dst1);
3781 }
3782 else if (!zero)
3783 {
3784 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3785 &xmm_dst0, &xmm_dst1);
3786 }
3787
3788 /* Second round */
3789 opaque = is_opaque (xmm_src);
3790 zero = is_zero (xmm_src);
3791
3792 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3793
3794 if (opaque)
3795 {
3796 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3797 &xmm_dst2, &xmm_dst3);
3798 }
3799 else if (!zero)
3800 {
3801 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3802 &xmm_dst2, &xmm_dst3);
3803 }
3804
3805 save_128_aligned (
3806 (__m128i*)dst, pack_565_4x128_128 (
3807 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3808
3809 w -= 8;
3810 src += 8;
3811 dst += 8;
3812 }
3813
3814 while (w)
3815 {
3816 s = *src++;
3817 d = *dst;
3818
3819 ms = unpack_32_1x128 (s);
3820
3821 *dst++ = pack_565_32_16 (
3822 pack_1x128_32 (
3823 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3824 w--;
3825 }
3826 }
3827
3828 }
3829
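/* OVER of a non-premultiplied (pixbuf) source onto an 8888 destination. */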
3830 static void
3831 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3832 pixman_composite_info_t *info)
3833 {
3834 PIXMAN_COMPOSITE_ARGS (info);
3835 uint32_t *dst_line, *dst, d;
3836 uint32_t *src_line, *src, s;
3837 int dst_stride, src_stride;
3838 int32_t w;
3839 uint32_t opaque, zero;
3840
3841 __m128i xmm_src_lo, xmm_src_hi;
3842 __m128i xmm_dst_lo, xmm_dst_hi;
3843
3844 PIXMAN_IMAGE_GET_LINE (
3845 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3846 PIXMAN_IMAGE_GET_LINE (
3847 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3848
3849 while (height--)
3850 {
3851 dst = dst_line;
3852 dst_line += dst_stride;
3853 src = src_line;
3854 src_line += src_stride;
3855 w = width;
3856
3857 while (w && (uintptr_t)dst & 15)
3858 {
3859 s = *src++;
3860 d = *dst;
3861
3862 *dst++ = pack_1x128_32 (
3863 over_rev_non_pre_1x128 (
3864 unpack_32_1x128 (s), unpack_32_1x128 (d)));
3865
3866 w--;
3867 }
3868
3869 while (w >= 4)
3870 {
3871 xmm_src_hi = load_128_unaligned ((__m128i*)src);
3872
3873 opaque = is_opaque (xmm_src_hi);
3874 zero = is_zero (xmm_src_hi);
3875
3876 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3877
3878 if (opaque)
3879 {
3880 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3881 &xmm_dst_lo, &xmm_dst_hi);
3882
3883 save_128_aligned (
3884 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3885 }
3886 else if (!zero)
3887 {
3888 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
3889
3890 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3891
3892 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3893 &xmm_dst_lo, &xmm_dst_hi);
3894
3895 save_128_aligned (
3896 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3897 }
3898
3899 w -= 4;
3900 dst += 4;
3901 src += 4;
3902 }
3903
3904 while (w)
3905 {
3906 s = *src++;
3907 d = *dst;
3908
3909 *dst++ = pack_1x128_32 (
3910 over_rev_non_pre_1x128 (
3911 unpack_32_1x128 (s), unpack_32_1x128 (d)));
3912
3913 w--;
3914 }
3915 }
3916
3917 }
3918
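/* OVER of a solid color through a component-alpha 8888 mask onto a 0565 destination. */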
3919 static void
3920 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3921 pixman_composite_info_t *info)
3922 {
3923 PIXMAN_COMPOSITE_ARGS (info);
3924 uint32_t src;
3925 uint16_t *dst_line, *dst, d;
3926 uint32_t *mask_line, *mask, m;
3927 int dst_stride, mask_stride;
3928 int w;
3929 uint32_t pack_cmp;
3930
3931 __m128i xmm_src, xmm_alpha;
3932 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3933 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3934
3935 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3936
3937 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3938
3939 if (src == 0)
3940 return;
3941
3942 PIXMAN_IMAGE_GET_LINE (
3943 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3944 PIXMAN_IMAGE_GET_LINE (
3945 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3946
3947 xmm_src = expand_pixel_32_1x128 (src);
3948 xmm_alpha = expand_alpha_1x128 (xmm_src);
3949 mmx_src = xmm_src;
3950 mmx_alpha = xmm_alpha;
3951
3952 while (height--)
3953 {
3954 w = width;
3955 mask = mask_line;
3956 dst = dst_line;
3957 mask_line += mask_stride;
3958 dst_line += dst_stride;
3959
3960 while (w && ((uintptr_t)dst & 15))
3961 {
3962 m = *(uint32_t *) mask;
3963
3964 if (m)
3965 {
3966 d = *dst;
3967 mmx_mask = unpack_32_1x128 (m);
3968 mmx_dest = expand565_16_1x128 (d);
3969
3970 *dst = pack_565_32_16 (
3971 pack_1x128_32 (
3972 in_over_1x128 (
3973 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3974 }
3975
3976 w--;
3977 dst++;
3978 mask++;
3979 }
3980
3981 while (w >= 8)
3982 {
3983 /* First round */
3984 xmm_mask = load_128_unaligned ((__m128i*)mask);
3985 xmm_dst = load_128_aligned ((__m128i*)dst);
3986
3987 pack_cmp = _mm_movemask_epi8 (
3988 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3989
3990 unpack_565_128_4x128 (xmm_dst,
3991 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3992 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3993
3994 /* preload next round */
3995 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
3996
3997 /* composite the first 4 pixels only if some mask values are non-zero */
3998 if (pack_cmp != 0xffff)
3999 {
4000 in_over_2x128 (&xmm_src, &xmm_src,
4001 &xmm_alpha, &xmm_alpha,
4002 &xmm_mask_lo, &xmm_mask_hi,
4003 &xmm_dst0, &xmm_dst1);
4004 }
4005
4006 /* Second round */
4007 pack_cmp = _mm_movemask_epi8 (
4008 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4009
4010 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4011
4012 if (pack_cmp != 0xffff)
4013 {
4014 in_over_2x128 (&xmm_src, &xmm_src,
4015 &xmm_alpha, &xmm_alpha,
4016 &xmm_mask_lo, &xmm_mask_hi,
4017 &xmm_dst2, &xmm_dst3);
4018 }
4019
4020 save_128_aligned (
4021 (__m128i*)dst, pack_565_4x128_128 (
4022 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4023
4024 w -= 8;
4025 dst += 8;
4026 mask += 8;
4027 }
4028
4029 while (w)
4030 {
4031 m = *(uint32_t *) mask;
4032
4033 if (m)
4034 {
4035 d = *dst;
4036 mmx_mask = unpack_32_1x128 (m);
4037 mmx_dest = expand565_16_1x128 (d);
4038
4039 *dst = pack_565_32_16 (
4040 pack_1x128_32 (
4041 in_over_1x128 (
4042 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4043 }
4044
4045 w--;
4046 dst++;
4047 mask++;
4048 }
4049 }
4050
4051 }
4052
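/* IN: multiply the a8 destination by the solid source alpha and the a8 mask. */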
4053 static void
4054 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4055 pixman_composite_info_t *info)
4056 {
4057 PIXMAN_COMPOSITE_ARGS (info);
4058 uint8_t *dst_line, *dst;
4059 uint8_t *mask_line, *mask;
4060 int dst_stride, mask_stride;
4061 uint32_t d, m;
4062 uint32_t src;
4063 int32_t w;
4064
4065 __m128i xmm_alpha;
4066 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4067 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4068
4069 PIXMAN_IMAGE_GET_LINE (
4070 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4071 PIXMAN_IMAGE_GET_LINE (
4072 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4073
4074 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4075
4076 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4077
4078 while (height--)
4079 {
4080 dst = dst_line;
4081 dst_line += dst_stride;
4082 mask = mask_line;
4083 mask_line += mask_stride;
4084 w = width;
4085
4086 while (w && ((uintptr_t)dst & 15))
4087 {
4088 m = (uint32_t) *mask++;
4089 d = (uint32_t) *dst;
4090
4091 *dst++ = (uint8_t) pack_1x128_32 (
4092 pix_multiply_1x128 (
4093 pix_multiply_1x128 (xmm_alpha,
4094 unpack_32_1x128 (m)),
4095 unpack_32_1x128 (d)));
4096 w--;
4097 }
4098
4099 while (w >= 16)
4100 {
4101 xmm_mask = load_128_unaligned ((__m128i*)mask);
4102 xmm_dst = load_128_aligned ((__m128i*)dst);
4103
4104 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4105 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4106
4107 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4108 &xmm_mask_lo, &xmm_mask_hi,
4109 &xmm_mask_lo, &xmm_mask_hi);
4110
4111 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4112 &xmm_dst_lo, &xmm_dst_hi,
4113 &xmm_dst_lo, &xmm_dst_hi);
4114
4115 save_128_aligned (
4116 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4117
4118 mask += 16;
4119 dst += 16;
4120 w -= 16;
4121 }
4122
4123 while (w)
4124 {
4125 m = (uint32_t) *mask++;
4126 d = (uint32_t) *dst;
4127
4128 *dst++ = (uint8_t) pack_1x128_32 (
4129 pix_multiply_1x128 (
4130 pix_multiply_1x128 (
4131 xmm_alpha, unpack_32_1x128 (m)),
4132 unpack_32_1x128 (d)));
4133 w--;
4134 }
4135 }
4136
4137 }
4138
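/* IN: multiply the a8 destination by the solid source alpha. */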
4139 static void
4140 sse2_composite_in_n_8 (pixman_implementation_t *imp,
4141 pixman_composite_info_t *info)
4142 {
4143 PIXMAN_COMPOSITE_ARGS (info);
4144 uint8_t *dst_line, *dst;
4145 int dst_stride;
4146 uint32_t d;
4147 uint32_t src;
4148 int32_t w;
4149
4150 __m128i xmm_alpha;
4151 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4152
4153 PIXMAN_IMAGE_GET_LINE (
4154 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4155
4156 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4157
4158 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4159
4160 src = src >> 24;
4161
4162 if (src == 0xff)
4163 return;
4164
4165 if (src == 0x00)
4166 {
4167 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4168 8, dest_x, dest_y, width, height, src);
4169
4170 return;
4171 }
4172
4173 while (height--)
4174 {
4175 dst = dst_line;
4176 dst_line += dst_stride;
4177 w = width;
4178
4179 while (w && ((uintptr_t)dst & 15))
4180 {
4181 d = (uint32_t) *dst;
4182
4183 *dst++ = (uint8_t) pack_1x128_32 (
4184 pix_multiply_1x128 (
4185 xmm_alpha,
4186 unpack_32_1x128 (d)));
4187 w--;
4188 }
4189
4190 while (w >= 16)
4191 {
4192 xmm_dst = load_128_aligned ((__m128i*)dst);
4193
4194 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4195
4196 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4197 &xmm_dst_lo, &xmm_dst_hi,
4198 &xmm_dst_lo, &xmm_dst_hi);
4199
4200 save_128_aligned (
4201 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4202
4203 dst += 16;
4204 w -= 16;
4205 }
4206
4207 while (w)
4208 {
4209 d = (uint32_t) *dst;
4210
4211 *dst++ = (uint8_t) pack_1x128_32 (
4212 pix_multiply_1x128 (
4213 xmm_alpha,
4214 unpack_32_1x128 (d)));
4215 w--;
4216 }
4217 }
4218
4219 }
4220
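/* IN: multiply the a8 destination by the a8 source. */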
4221 static void
4222 sse2_composite_in_8_8 (pixman_implementation_t *imp,
4223 pixman_composite_info_t *info)
4224 {
4225 PIXMAN_COMPOSITE_ARGS (info);
4226 uint8_t *dst_line, *dst;
4227 uint8_t *src_line, *src;
4228 int src_stride, dst_stride;
4229 int32_t w;
4230 uint32_t s, d;
4231
4232 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4233 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4234
4235 PIXMAN_IMAGE_GET_LINE (
4236 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4237 PIXMAN_IMAGE_GET_LINE (
4238 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4239
4240 while (height--)
4241 {
4242 dst = dst_line;
4243 dst_line += dst_stride;
4244 src = src_line;
4245 src_line += src_stride;
4246 w = width;
4247
4248 while (w && ((uintptr_t)dst & 15))
4249 {
4250 s = (uint32_t) *src++;
4251 d = (uint32_t) *dst;
4252
4253 *dst++ = (uint8_t) pack_1x128_32 (
4254 pix_multiply_1x128 (
4255 unpack_32_1x128 (s), unpack_32_1x128 (d)));
4256 w--;
4257 }
4258
4259 while (w >= 16)
4260 {
4261 xmm_src = load_128_unaligned ((__m128i*)src);
4262 xmm_dst = load_128_aligned ((__m128i*)dst);
4263
4264 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4265 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4266
4267 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4268 &xmm_dst_lo, &xmm_dst_hi,
4269 &xmm_dst_lo, &xmm_dst_hi);
4270
4271 save_128_aligned (
4272 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4273
4274 src += 16;
4275 dst += 16;
4276 w -= 16;
4277 }
4278
4279 while (w)
4280 {
4281 s = (uint32_t) *src++;
4282 d = (uint32_t) *dst;
4283
4284 *dst++ = (uint8_t) pack_1x128_32 (
4285 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4286 w--;
4287 }
4288 }
4289
4290 }
4291
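/* ADD of the solid source alpha, scaled by the a8 mask, onto an a8 destination (saturating). */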
4292 static void
4293 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4294 pixman_composite_info_t *info)
4295 {
4296 PIXMAN_COMPOSITE_ARGS (info);
4297 uint8_t *dst_line, *dst;
4298 uint8_t *mask_line, *mask;
4299 int dst_stride, mask_stride;
4300 int32_t w;
4301 uint32_t src;
4302 uint32_t m, d;
4303
4304 __m128i xmm_alpha;
4305 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4306 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4307
4308 PIXMAN_IMAGE_GET_LINE (
4309 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4310 PIXMAN_IMAGE_GET_LINE (
4311 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4312
4313 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4314
4315 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4316
4317 while (height--)
4318 {
4319 dst = dst_line;
4320 dst_line += dst_stride;
4321 mask = mask_line;
4322 mask_line += mask_stride;
4323 w = width;
4324
4325 while (w && ((uintptr_t)dst & 15))
4326 {
4327 m = (uint32_t) *mask++;
4328 d = (uint32_t) *dst;
4329
4330 *dst++ = (uint8_t) pack_1x128_32 (
4331 _mm_adds_epu16 (
4332 pix_multiply_1x128 (
4333 xmm_alpha, unpack_32_1x128 (m)),
4334 unpack_32_1x128 (d)));
4335 w--;
4336 }
4337
4338 while (w >= 16)
4339 {
4340 xmm_mask = load_128_unaligned ((__m128i*)mask);
4341 xmm_dst = load_128_aligned ((__m128i*)dst);
4342
4343 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4344 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4345
4346 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4347 &xmm_mask_lo, &xmm_mask_hi,
4348 &xmm_mask_lo, &xmm_mask_hi);
4349
4350 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4351 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4352
4353 save_128_aligned (
4354 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4355
4356 mask += 16;
4357 dst += 16;
4358 w -= 16;
4359 }
4360
4361 while (w)
4362 {
4363 m = (uint32_t) *mask++;
4364 d = (uint32_t) *dst;
4365
4366 *dst++ = (uint8_t) pack_1x128_32 (
4367 _mm_adds_epu16 (
4368 pix_multiply_1x128 (
4369 xmm_alpha, unpack_32_1x128 (m)),
4370 unpack_32_1x128 (d)));
4371
4372 w--;
4373 }
4374 }
4375
4376 }
4377
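/* ADD of the solid source alpha onto an a8 destination (saturating). */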
4378 static void
4379 sse2_composite_add_n_8 (pixman_implementation_t *imp,
4380 pixman_composite_info_t *info)
4381 {
4382 PIXMAN_COMPOSITE_ARGS (info);
4383 uint8_t *dst_line, *dst;
4384 int dst_stride;
4385 int32_t w;
4386 uint32_t src;
4387
4388 __m128i xmm_src;
4389
4390 PIXMAN_IMAGE_GET_LINE (
4391 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4392
4393 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4394
4395 src >>= 24;
4396
4397 if (src == 0x00)
4398 return;
4399
4400 if (src == 0xff)
4401 {
4402 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4403 8, dest_x, dest_y, width, height, 0xff);
4404
4405 return;
4406 }
4407
4408 src = (src << 24) | (src << 16) | (src << 8) | src;
4409 xmm_src = _mm_set_epi32 (src, src, src, src);
4410
4411 while (height--)
4412 {
4413 dst = dst_line;
4414 dst_line += dst_stride;
4415 w = width;
4416
4417 while (w && ((uintptr_t)dst & 15))
4418 {
4419 *dst = (uint8_t)_mm_cvtsi128_si32 (
4420 _mm_adds_epu8 (
4421 xmm_src,
4422 _mm_cvtsi32_si128 (*dst)));
4423
4424 w--;
4425 dst++;
4426 }
4427
4428 while (w >= 16)
4429 {
4430 save_128_aligned (
4431 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4432
4433 dst += 16;
4434 w -= 16;
4435 }
4436
4437 while (w)
4438 {
4439 *dst = (uint8_t)_mm_cvtsi128_si32 (
4440 _mm_adds_epu8 (
4441 xmm_src,
4442 _mm_cvtsi32_si128 (*dst)));
4443
4444 w--;
4445 dst++;
4446 }
4447 }
4448
4449 }
4450
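/* ADD of an a8 source onto an a8 destination (saturating). */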
4451 static void
4452 sse2_composite_add_8_8 (pixman_implementation_t *imp,
4453 pixman_composite_info_t *info)
4454 {
4455 PIXMAN_COMPOSITE_ARGS (info);
4456 uint8_t *dst_line, *dst;
4457 uint8_t *src_line, *src;
4458 int dst_stride, src_stride;
4459 int32_t w;
4460 uint16_t t;
4461
4462 PIXMAN_IMAGE_GET_LINE (
4463 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4464 PIXMAN_IMAGE_GET_LINE (
4465 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4466
4467 while (height--)
4468 {
4469 dst = dst_line;
4470 src = src_line;
4471
4472 dst_line += dst_stride;
4473 src_line += src_stride;
4474 w = width;
4475
4476 /* Small head */
4477 while (w && (uintptr_t)dst & 3)
4478 {
4479 t = (*dst) + (*src++);
4480 *dst++ = t | (0 - (t >> 8));
4481 w--;
4482 }
4483
4484 sse2_combine_add_u (imp, op,
4485 (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4486
4487 /* Small tail */
4488 dst += w & 0xfffc;
4489 src += w & 0xfffc;
4490
4491 w &= 3;
4492
4493 while (w)
4494 {
4495 t = (*dst) + (*src++);
4496 *dst++ = t | (0 - (t >> 8));
4497 w--;
4498 }
4499 }
4500
4501 }
4502
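/* ADD of an 8888 source onto an 8888 destination, one scanline at a time. */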
4503 static void
4504 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4505 pixman_composite_info_t *info)
4506 {
4507 PIXMAN_COMPOSITE_ARGS (info);
4508 uint32_t *dst_line, *dst;
4509 uint32_t *src_line, *src;
4510 int dst_stride, src_stride;
4511
4512 PIXMAN_IMAGE_GET_LINE (
4513 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4514 PIXMAN_IMAGE_GET_LINE (
4515 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4516
4517 while (height--)
4518 {
4519 dst = dst_line;
4520 dst_line += dst_stride;
4521 src = src_line;
4522 src_line += src_stride;
4523
4524 sse2_combine_add_u (imp, op, dst, src, NULL, width);
4525 }
4526 }
4527
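/* ADD of a solid color onto an 8888 destination (saturating). */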
4528 static void
4529 sse2_composite_add_n_8888 (pixman_implementation_t *imp,
4530 pixman_composite_info_t *info)
4531 {
4532 PIXMAN_COMPOSITE_ARGS (info);
4533 uint32_t *dst_line, *dst, src;
4534 int dst_stride;
4535
4536 __m128i xmm_src;
4537
4538 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4539
4540 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4541 if (src == 0)
4542 return;
4543
4544 if (src == ~0)
4545 {
4546 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
4547 dest_x, dest_y, width, height, ~0);
4548
4549 return;
4550 }
4551
4552 xmm_src = _mm_set_epi32 (src, src, src, src);
4553 while (height--)
4554 {
4555 int w = width;
4556 uint32_t d;
4557
4558 dst = dst_line;
4559 dst_line += dst_stride;
4560
4561 while (w && (uintptr_t)dst & 15)
4562 {
4563 d = *dst;
4564 *dst++ =
4565 _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
4566 w--;
4567 }
4568
4569 while (w >= 4)
4570 {
4571 save_128_aligned
4572 ((__m128i*)dst,
4573 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4574
4575 dst += 4;
4576 w -= 4;
4577 }
4578
4579 while (w--)
4580 {
4581 d = *dst;
4582 *dst++ =
4583 _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
4584 _mm_cvtsi32_si128 (d)));
4585 }
4586 }
4587 }
4588
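/* ADD of a solid color, scaled by an a8 mask, onto an 8888 destination (saturating). */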
4589 static void
4590 sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
4591 pixman_composite_info_t *info)
4592 {
4593 PIXMAN_COMPOSITE_ARGS (info);
4594 uint32_t *dst_line, *dst;
4595 uint8_t *mask_line, *mask;
4596 int dst_stride, mask_stride;
4597 int32_t w;
4598 uint32_t src;
4599
4600 __m128i xmm_src;
4601
4602 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4603 if (src == 0)
4604 return;
4605 xmm_src = expand_pixel_32_1x128 (src);
4606
4607 PIXMAN_IMAGE_GET_LINE (
4608 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4609 PIXMAN_IMAGE_GET_LINE (
4610 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4611
4612 while (height--)
4613 {
4614 dst = dst_line;
4615 dst_line += dst_stride;
4616 mask = mask_line;
4617 mask_line += mask_stride;
4618 w = width;
4619
4620 while (w && ((uintptr_t)dst & 15))
4621 {
4622 uint8_t m = *mask++;
4623 if (m)
4624 {
4625 *dst = pack_1x128_32
4626 (_mm_adds_epu16
4627 (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4628 unpack_32_1x128 (*dst)));
4629 }
4630 dst++;
4631 w--;
4632 }
4633
4634 while (w >= 4)
4635 {
4636 uint32_t m = *(uint32_t*)mask;
4637 if (m)
4638 {
4639 __m128i xmm_mask_lo, xmm_mask_hi;
4640 __m128i xmm_dst_lo, xmm_dst_hi;
4641
4642 __m128i xmm_dst = load_128_aligned ((__m128i*)dst);
4643 __m128i xmm_mask =
4644 _mm_unpacklo_epi8 (unpack_32_1x128(m),
4645 _mm_setzero_si128 ());
4646
4647 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4648 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4649
4650 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4651 &xmm_mask_lo, &xmm_mask_hi);
4652
4653 pix_multiply_2x128 (&xmm_src, &xmm_src,
4654 &xmm_mask_lo, &xmm_mask_hi,
4655 &xmm_mask_lo, &xmm_mask_hi);
4656
4657 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4658 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4659
4660 save_128_aligned (
4661 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4662 }
4663
4664 w -= 4;
4665 dst += 4;
4666 mask += 4;
4667 }
4668
4669 while (w)
4670 {
4671 uint8_t m = *mask++;
4672 if (m)
4673 {
4674 *dst = pack_1x128_32
4675 (_mm_adds_epu16
4676 (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4677 unpack_32_1x128 (*dst)));
4678 }
4679 dst++;
4680 w--;
4681 }
4682 }
4683 }
4684
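/* Copy a rectangle of pixels; source and destination must have the same bpp (16 or 32). */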
4685 static pixman_bool_t
4686 sse2_blt (pixman_implementation_t *imp,
4687 uint32_t * src_bits,
4688 uint32_t * dst_bits,
4689 int src_stride,
4690 int dst_stride,
4691 int src_bpp,
4692 int dst_bpp,
4693 int src_x,
4694 int src_y,
4695 int dest_x,
4696 int dest_y,
4697 int width,
4698 int height)
4699 {
4700 uint8_t * src_bytes;
4701 uint8_t * dst_bytes;
4702 int byte_width;
4703
4704 if (src_bpp != dst_bpp)
4705 return FALSE;
4706
4707 if (src_bpp == 16)
4708 {
4709 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4710 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4711 src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4712 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4713 byte_width = 2 * width;
4714 src_stride *= 2;
4715 dst_stride *= 2;
4716 }
4717 else if (src_bpp == 32)
4718 {
4719 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4720 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4721 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4722 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4723 byte_width = 4 * width;
4724 src_stride *= 4;
4725 dst_stride *= 4;
4726 }
4727 else
4728 {
4729 return FALSE;
4730 }
4731
4732 while (height--)
4733 {
4734 int w;
4735 uint8_t *s = src_bytes;
4736 uint8_t *d = dst_bytes;
4737 src_bytes += src_stride;
4738 dst_bytes += dst_stride;
4739 w = byte_width;
4740
4741 while (w >= 2 && ((uintptr_t)d & 3))
4742 {
4743 *(uint16_t *)d = *(uint16_t *)s;
4744 w -= 2;
4745 s += 2;
4746 d += 2;
4747 }
4748
4749 while (w >= 4 && ((uintptr_t)d & 15))
4750 {
4751 *(uint32_t *)d = *(uint32_t *)s;
4752
4753 w -= 4;
4754 s += 4;
4755 d += 4;
4756 }
4757
4758 while (w >= 64)
4759 {
4760 __m128i xmm0, xmm1, xmm2, xmm3;
4761
4762 xmm0 = load_128_unaligned ((__m128i*)(s));
4763 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4764 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4765 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4766
4767 save_128_aligned ((__m128i*)(d), xmm0);
4768 save_128_aligned ((__m128i*)(d + 16), xmm1);
4769 save_128_aligned ((__m128i*)(d + 32), xmm2);
4770 save_128_aligned ((__m128i*)(d + 48), xmm3);
4771
4772 s += 64;
4773 d += 64;
4774 w -= 64;
4775 }
4776
4777 while (w >= 16)
4778 {
4779 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4780
4781 w -= 16;
4782 d += 16;
4783 s += 16;
4784 }
4785
4786 while (w >= 4)
4787 {
4788 *(uint32_t *)d = *(uint32_t *)s;
4789
4790 w -= 4;
4791 s += 4;
4792 d += 4;
4793 }
4794
4795 if (w >= 2)
4796 {
4797 *(uint16_t *)d = *(uint16_t *)s;
4798 w -= 2;
4799 s += 2;
4800 d += 2;
4801 }
4802 }
4803
4804 return TRUE;
4805 }
4806
4807 static void
4808 sse2_composite_copy_area (pixman_implementation_t *imp,
4809 pixman_composite_info_t *info)
4810 {
4811 PIXMAN_COMPOSITE_ARGS (info);
4812 sse2_blt (imp, src_image->bits.bits,
4813 dest_image->bits.bits,
4814 src_image->bits.rowstride,
4815 dest_image->bits.rowstride,
4816 PIXMAN_FORMAT_BPP (src_image->bits.format),
4817 PIXMAN_FORMAT_BPP (dest_image->bits.format),
4818 src_x, src_y, dest_x, dest_y, width, height);
4819 }
4820
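/*
 * OVER composite of an x888 source through an a8 mask.  The source alpha
 * is forced to 0xff, so a block whose four mask bytes are all 0xff can be
 * stored directly; everything else goes through the usual in_over path.
 */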
4821 static void
4822 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4823 pixman_composite_info_t *info)
4824 {
4825 PIXMAN_COMPOSITE_ARGS (info);
4826 uint32_t *src, *src_line, s;
4827 uint32_t *dst, *dst_line, d;
4828 uint8_t *mask, *mask_line;
4829 uint32_t m;
4830 int src_stride, mask_stride, dst_stride;
4831 int32_t w;
4832 __m128i ms;
4833
4834 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4835 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4836 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4837
4838 PIXMAN_IMAGE_GET_LINE (
4839 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4840 PIXMAN_IMAGE_GET_LINE (
4841 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4842 PIXMAN_IMAGE_GET_LINE (
4843 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4844
4845 while (height--)
4846 {
4847 src = src_line;
4848 src_line += src_stride;
4849 dst = dst_line;
4850 dst_line += dst_stride;
4851 mask = mask_line;
4852 mask_line += mask_stride;
4853
4854 w = width;
4855
4856 while (w && (uintptr_t)dst & 15)
4857 {
4858 s = 0xff000000 | *src++;
4859 m = (uint32_t) *mask++;
4860 d = *dst;
4861 ms = unpack_32_1x128 (s);
4862
4863 if (m != 0xff)
4864 {
4865 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4866 __m128i md = unpack_32_1x128 (d);
4867
4868 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4869 }
4870
4871 *dst++ = pack_1x128_32 (ms);
4872 w--;
4873 }
4874
4875 while (w >= 4)
4876 {
4877 m = *(uint32_t*) mask;
4878 xmm_src = _mm_or_si128 (
4879 load_128_unaligned ((__m128i*)src), mask_ff000000);
4880
4881 if (m == 0xffffffff)
4882 {
4883 save_128_aligned ((__m128i*)dst, xmm_src);
4884 }
4885 else
4886 {
4887 xmm_dst = load_128_aligned ((__m128i*)dst);
4888
4889 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4890
4891 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4892 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4893 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4894
4895 expand_alpha_rev_2x128 (
4896 xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4897
4898 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
4899 &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
4900 &xmm_dst_lo, &xmm_dst_hi);
4901
4902 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4903 }
4904
4905 src += 4;
4906 dst += 4;
4907 mask += 4;
4908 w -= 4;
4909 }
4910
4911 while (w)
4912 {
4913 m = (uint32_t) *mask++;
4914
4915 if (m)
4916 {
4917 s = 0xff000000 | *src;
4918
4919 if (m == 0xff)
4920 {
4921 *dst = s;
4922 }
4923 else
4924 {
4925 __m128i ma, md, ms;
4926
4927 d = *dst;
4928
4929 ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4930 md = unpack_32_1x128 (d);
4931 ms = unpack_32_1x128 (s);
4932
4933 *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4934 }
4935
4936 }
4937
4938 src++;
4939 dst++;
4940 w--;
4941 }
4942 }
4943
4944 }
4945
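/*
 * Like the x888 variant above, but the source carries real alpha: a block
 * is stored directly only when the mask word is 0xffffffff and all four
 * source pixels are opaque, and a zero mask word skips the block entirely.
 */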
4946 static void
4947 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4948 pixman_composite_info_t *info)
4949 {
4950 PIXMAN_COMPOSITE_ARGS (info);
4951 uint32_t *src, *src_line, s;
4952 uint32_t *dst, *dst_line, d;
4953 uint8_t *mask, *mask_line;
4954 uint32_t m;
4955 int src_stride, mask_stride, dst_stride;
4956 int32_t w;
4957
4958 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4959 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4960 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4961
4962 PIXMAN_IMAGE_GET_LINE (
4963 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4964 PIXMAN_IMAGE_GET_LINE (
4965 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4966 PIXMAN_IMAGE_GET_LINE (
4967 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4968
4969 while (height--)
4970 {
4971 src = src_line;
4972 src_line += src_stride;
4973 dst = dst_line;
4974 dst_line += dst_stride;
4975 mask = mask_line;
4976 mask_line += mask_stride;
4977
4978 w = width;
4979
4980 while (w && (uintptr_t)dst & 15)
4981 {
4982 uint32_t sa;
4983
4984 s = *src++;
4985 m = (uint32_t) *mask++;
4986 d = *dst;
4987
4988 sa = s >> 24;
4989
4990 if (m)
4991 {
4992 if (sa == 0xff && m == 0xff)
4993 {
4994 *dst = s;
4995 }
4996 else
4997 {
4998 __m128i ms, md, ma, msa;
4999
5000 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5001 ms = unpack_32_1x128 (s);
5002 md = unpack_32_1x128 (d);
5003
5004 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5005
5006 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5007 }
5008 }
5009
5010 dst++;
5011 w--;
5012 }
5013
5014 while (w >= 4)
5015 {
5016 m = *(uint32_t *) mask;
5017
5018 if (m)
5019 {
5020 xmm_src = load_128_unaligned ((__m128i*)src);
5021
5022 if (m == 0xffffffff && is_opaque (xmm_src))
5023 {
5024 save_128_aligned ((__m128i *)dst, xmm_src);
5025 }
5026 else
5027 {
5028 xmm_dst = load_128_aligned ((__m128i *)dst);
5029
5030 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5031
5032 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5033 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5034 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5035
5036 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5037 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5038
5039 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5040 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5041
5042 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5043 }
5044 }
5045
5046 src += 4;
5047 dst += 4;
5048 mask += 4;
5049 w -= 4;
5050 }
5051
5052 while (w)
5053 {
5054 uint32_t sa;
5055
5056 s = *src++;
5057 m = (uint32_t) *mask++;
5058 d = *dst;
5059
5060 sa = s >> 24;
5061
5062 if (m)
5063 {
5064 if (sa == 0xff && m == 0xff)
5065 {
5066 *dst = s;
5067 }
5068 else
5069 {
5070 __m128i ms, md, ma, msa;
5071
5072 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5073 ms = unpack_32_1x128 (s);
5074 md = unpack_32_1x128 (d);
5075
5076 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5077
5078 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5079 }
5080 }
5081
5082 dst++;
5083 w--;
5084 }
5085 }
5086
5087 }
5088
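/*
 * OVER_REVERSE with a solid source: the existing destination pixels are
 * composited over the constant color, i.e. dest = dest OVER src.
 */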
5089 static void
5090 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5091 pixman_composite_info_t *info)
5092 {
5093 PIXMAN_COMPOSITE_ARGS (info);
5094 uint32_t src;
5095 uint32_t *dst_line, *dst;
5096 __m128i xmm_src;
5097 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5098 __m128i xmm_dsta_hi, xmm_dsta_lo;
5099 int dst_stride;
5100 int32_t w;
5101
5102 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
5103
5104 if (src == 0)
5105 return;
5106
5107 PIXMAN_IMAGE_GET_LINE (
5108 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5109
5110 xmm_src = expand_pixel_32_1x128 (src);
5111
5112 while (height--)
5113 {
5114 dst = dst_line;
5115
5116 dst_line += dst_stride;
5117 w = width;
5118
5119 while (w && (uintptr_t)dst & 15)
5120 {
5121 __m128i vd;
5122
5123 vd = unpack_32_1x128 (*dst);
5124
5125 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5126 xmm_src));
5127 w--;
5128 dst++;
5129 }
5130
5131 while (w >= 4)
5132 {
5133 __m128i tmp_lo, tmp_hi;
5134
5135 xmm_dst = load_128_aligned ((__m128i*)dst);
5136
5137 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5138 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5139
5140 tmp_lo = xmm_src;
5141 tmp_hi = xmm_src;
5142
5143 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5144 &xmm_dsta_lo, &xmm_dsta_hi,
5145 &tmp_lo, &tmp_hi);
5146
5147 save_128_aligned (
5148 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5149
5150 w -= 4;
5151 dst += 4;
5152 }
5153
5154 while (w)
5155 {
5156 __m128i vd;
5157
5158 vd = unpack_32_1x128 (*dst);
5159
5160 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5161 xmm_src));
5162 w--;
5163 dst++;
5164 }
5165
5166 }
5167
5168 }
5169
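/*
 * OVER with an a8r8g8b8 source and an a8r8g8b8 mask; only the alpha
 * channel of the mask is used (the mask is not component-alpha here).
 */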
5170 static void
5171 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5172 pixman_composite_info_t *info)
5173 {
5174 PIXMAN_COMPOSITE_ARGS (info);
5175 uint32_t *src, *src_line, s;
5176 uint32_t *dst, *dst_line, d;
5177 uint32_t *mask, *mask_line;
5178 uint32_t m;
5179 int src_stride, mask_stride, dst_stride;
5180 int32_t w;
5181
5182 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5183 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5184 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5185
5186 PIXMAN_IMAGE_GET_LINE (
5187 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5188 PIXMAN_IMAGE_GET_LINE (
5189 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5190 PIXMAN_IMAGE_GET_LINE (
5191 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5192
5193 while (height--)
5194 {
5195 src = src_line;
5196 src_line += src_stride;
5197 dst = dst_line;
5198 dst_line += dst_stride;
5199 mask = mask_line;
5200 mask_line += mask_stride;
5201
5202 w = width;
5203
5204 while (w && (uintptr_t)dst & 15)
5205 {
5206 uint32_t sa;
5207
5208 s = *src++;
5209 m = (*mask++) >> 24;
5210 d = *dst;
5211
5212 sa = s >> 24;
5213
5214 if (m)
5215 {
5216 if (sa == 0xff && m == 0xff)
5217 {
5218 *dst = s;
5219 }
5220 else
5221 {
5222 __m128i ms, md, ma, msa;
5223
5224 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5225 ms = unpack_32_1x128 (s);
5226 md = unpack_32_1x128 (d);
5227
5228 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5229
5230 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5231 }
5232 }
5233
5234 dst++;
5235 w--;
5236 }
5237
5238 while (w >= 4)
5239 {
5240 xmm_mask = load_128_unaligned ((__m128i*)mask);
5241
5242 if (!is_transparent (xmm_mask))
5243 {
5244 xmm_src = load_128_unaligned ((__m128i*)src);
5245
5246 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5247 {
5248 save_128_aligned ((__m128i *)dst, xmm_src);
5249 }
5250 else
5251 {
5252 xmm_dst = load_128_aligned ((__m128i *)dst);
5253
5254 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5255 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5256 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5257
5258 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5259 expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5260
5261 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5262 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5263
5264 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5265 }
5266 }
5267
5268 src += 4;
5269 dst += 4;
5270 mask += 4;
5271 w -= 4;
5272 }
5273
5274 while (w)
5275 {
5276 uint32_t sa;
5277
5278 s = *src++;
5279 m = (*mask++) >> 24;
5280 d = *dst;
5281
5282 sa = s >> 24;
5283
5284 if (m)
5285 {
5286 if (sa == 0xff && m == 0xff)
5287 {
5288 *dst = s;
5289 }
5290 else
5291 {
5292 __m128i ms, md, ma, msa;
5293
5294 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5295 ms = unpack_32_1x128 (s);
5296 md = unpack_32_1x128 (d);
5297
5298 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5299
5300 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5301 }
5302 }
5303
5304 dst++;
5305 w--;
5306 }
5307 }
5308
5309 }
5310
5311 /* A variant of 'sse2_combine_over_u' with minor tweaks */
5312 static force_inline void
5313 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
5314 const uint32_t* ps,
5315 int32_t w,
5316 pixman_fixed_t vx,
5317 pixman_fixed_t unit_x,
5318 pixman_fixed_t src_width_fixed,
5319 pixman_bool_t fully_transparent_src)
5320 {
5321 uint32_t s, d;
5322 const uint32_t* pm = NULL;
5323
5324 __m128i xmm_dst_lo, xmm_dst_hi;
5325 __m128i xmm_src_lo, xmm_src_hi;
5326 __m128i xmm_alpha_lo, xmm_alpha_hi;
5327
5328 if (fully_transparent_src)
5329 return;
5330
5331 /* Align dst on a 16-byte boundary */
5332 while (w && ((uintptr_t)pd & 15))
5333 {
5334 d = *pd;
5335 s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5336 vx += unit_x;
5337 while (vx >= 0)
5338 vx -= src_width_fixed;
5339
5340 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5341 if (pm)
5342 pm++;
5343 w--;
5344 }
5345
5346 while (w >= 4)
5347 {
5348 __m128i tmp;
5349 uint32_t tmp1, tmp2, tmp3, tmp4;
5350
5351 tmp1 = *(ps + pixman_fixed_to_int (vx));
5352 vx += unit_x;
5353 while (vx >= 0)
5354 vx -= src_width_fixed;
5355 tmp2 = *(ps + pixman_fixed_to_int (vx));
5356 vx += unit_x;
5357 while (vx >= 0)
5358 vx -= src_width_fixed;
5359 tmp3 = *(ps + pixman_fixed_to_int (vx));
5360 vx += unit_x;
5361 while (vx >= 0)
5362 vx -= src_width_fixed;
5363 tmp4 = *(ps + pixman_fixed_to_int (vx));
5364 vx += unit_x;
5365 while (vx >= 0)
5366 vx -= src_width_fixed;
5367
5368 tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5369
5370 xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5371
5372 if (is_opaque (xmm_src_hi))
5373 {
5374 save_128_aligned ((__m128i*)pd, xmm_src_hi);
5375 }
5376 else if (!is_zero (xmm_src_hi))
5377 {
5378 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5379
5380 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5381 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5382
5383 expand_alpha_2x128 (
5384 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5385
5386 over_2x128 (&xmm_src_lo, &xmm_src_hi,
5387 &xmm_alpha_lo, &xmm_alpha_hi,
5388 &xmm_dst_lo, &xmm_dst_hi);
5389
5390 /* rebuild the 4 pixel data and save */
5391 save_128_aligned ((__m128i*)pd,
5392 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5393 }
5394
5395 w -= 4;
5396 pd += 4;
5397 if (pm)
5398 pm += 4;
5399 }
5400
5401 while (w)
5402 {
5403 d = *pd;
5404 s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5405 vx += unit_x;
5406 while (vx >= 0)
5407 vx -= src_width_fixed;
5408
5409 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
5410 if (pm)
5411 pm++;
5412
5413 w--;
5414 }
5415 }
5416
5417 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5418 scaled_nearest_scanline_sse2_8888_8888_OVER,
5419 uint32_t, uint32_t, COVER)
5420 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5421 scaled_nearest_scanline_sse2_8888_8888_OVER,
5422 uint32_t, uint32_t, NONE)
5423 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5424 scaled_nearest_scanline_sse2_8888_8888_OVER,
5425 uint32_t, uint32_t, PAD)
5426 FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
5427 scaled_nearest_scanline_sse2_8888_8888_OVER,
5428 uint32_t, uint32_t, NORMAL)
5429
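/*
 * Nearest-scaled OVER with a solid mask: the constant mask alpha is
 * expanded once into xmm_mask and applied to every fetched source pixel
 * through in_over.
 */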
5430 static force_inline void
5431 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5432 uint32_t * dst,
5433 const uint32_t * src,
5434 int32_t w,
5435 pixman_fixed_t vx,
5436 pixman_fixed_t unit_x,
5437 pixman_fixed_t src_width_fixed,
5438 pixman_bool_t zero_src)
5439 {
5440 __m128i xmm_mask;
5441 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5442 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5443 __m128i xmm_alpha_lo, xmm_alpha_hi;
5444
5445 if (zero_src || (*mask >> 24) == 0)
5446 return;
5447
5448 xmm_mask = create_mask_16_128 (*mask >> 24);
5449
5450 while (w && (uintptr_t)dst & 15)
5451 {
5452 uint32_t s = *(src + pixman_fixed_to_int (vx));
5453 vx += unit_x;
5454 while (vx >= 0)
5455 vx -= src_width_fixed;
5456
5457 if (s)
5458 {
5459 uint32_t d = *dst;
5460
5461 __m128i ms = unpack_32_1x128 (s);
5462 __m128i alpha = expand_alpha_1x128 (ms);
5463 __m128i dest = xmm_mask;
5464 __m128i alpha_dst = unpack_32_1x128 (d);
5465
5466 *dst = pack_1x128_32 (
5467 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
5468 }
5469 dst++;
5470 w--;
5471 }
5472
5473 while (w >= 4)
5474 {
5475 uint32_t tmp1, tmp2, tmp3, tmp4;
5476
5477 tmp1 = *(src + pixman_fixed_to_int (vx));
5478 vx += unit_x;
5479 while (vx >= 0)
5480 vx -= src_width_fixed;
5481 tmp2 = *(src + pixman_fixed_to_int (vx));
5482 vx += unit_x;
5483 while (vx >= 0)
5484 vx -= src_width_fixed;
5485 tmp3 = *(src + pixman_fixed_to_int (vx));
5486 vx += unit_x;
5487 while (vx >= 0)
5488 vx -= src_width_fixed;
5489 tmp4 = *(src + pixman_fixed_to_int (vx));
5490 vx += unit_x;
5491 while (vx >= 0)
5492 vx -= src_width_fixed;
5493
5494 xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5495
5496 if (!is_zero (xmm_src))
5497 {
5498 xmm_dst = load_128_aligned ((__m128i*)dst);
5499
5500 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5501 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5502 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5503 &xmm_alpha_lo, &xmm_alpha_hi);
5504
5505 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5506 &xmm_alpha_lo, &xmm_alpha_hi,
5507 &xmm_mask, &xmm_mask,
5508 &xmm_dst_lo, &xmm_dst_hi);
5509
5510 save_128_aligned (
5511 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5512 }
5513
5514 dst += 4;
5515 w -= 4;
5516 }
5517
5518 while (w)
5519 {
5520 uint32_t s = *(src + pixman_fixed_to_int (vx));
5521 vx += unit_x;
5522 while (vx >= 0)
5523 vx -= src_width_fixed;
5524
5525 if (s)
5526 {
5527 uint32_t d = *dst;
5528
5529 __m128i ms = unpack_32_1x128 (s);
5530 __m128i alpha = expand_alpha_1x128 (ms);
5531 __m128i mask = xmm_mask;
5532 __m128i dest = unpack_32_1x128 (d);
5533
5534 *dst = pack_1x128_32 (
5535 in_over_1x128 (&ms, &alpha, &mask, &dest));
5536 }
5537
5538 dst++;
5539 w--;
5540 }
5541
5542 }
5543
5544 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5545 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5546 uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5547 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5548 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5549 uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5550 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5551 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5552 uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
5553 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
5554 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5555 uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
5556
5557 #define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
5558
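/*
 * The macros below perform one step of bilinear filtering.  The 2x2
 * source block is first blended vertically with the constant weights
 * wt/wb and then horizontally with the per-pixel weights (range - fx, fx)
 * taken from xmm_x, where range is 1 << BILINEAR_INTERPOLATION_BITS;
 * per channel this computes
 *
 *   ((tl * wt + bl * wb) * (range - fx) + (tr * wt + br * wb) * fx)
 *       >> (2 * BILINEAR_INTERPOLATION_BITS)
 *
 * When fewer than 8 interpolation bits are used the horizontal pass can
 * be done with pmaddwd, since the vertically blended values still fit in
 * a signed 16-bit lane; with 8 bits a full 16x16 multiply is needed.
 */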
5559 #define BILINEAR_DECLARE_VARIABLES \
5560 const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
5561 const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
5562 const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\
5563 const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \
5564 const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\
5565 const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
5566 const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x, \
5567 unit_x, unit_x, unit_x, unit_x); \
5568 const __m128i xmm_zero = _mm_setzero_si128 (); \
5569 __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
5570
5571 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
5572 do { \
5573 __m128i xmm_wh, xmm_lo, xmm_hi, a; \
5574 /* fetch 2x2 pixel block into sse2 registers */ \
5575 __m128i tltr = _mm_loadl_epi64 ( \
5576 (__m128i *)&src_top[pixman_fixed_to_int (vx)]); \
5577 __m128i blbr = _mm_loadl_epi64 ( \
5578 (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]); \
5579 vx += unit_x; \
5580 /* vertical interpolation */ \
5581 a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), \
5582 xmm_wt), \
5583 _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), \
5584 xmm_wb)); \
5585 if (BILINEAR_INTERPOLATION_BITS < 8) \
5586 { \
5587 /* calculate horizontal weights */ \
5588 xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7, \
5589 _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \
5590 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
5591 /* horizontal interpolation */ \
5592 a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \
5593 a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh); \
5594 } \
5595 else \
5596 { \
5597 /* calculate horizontal weights */ \
5598 xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8, \
5599 _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \
5600 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
5601 /* horizontal interpolation */ \
5602 xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \
5603 xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \
5604 a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \
5605 _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \
5606 } \
5607 /* shift and pack the result */ \
5608 a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2); \
5609 a = _mm_packs_epi32 (a, a); \
5610 a = _mm_packus_epi16 (a, a); \
5611 pix = _mm_cvtsi128_si32 (a); \
5612 } while (0)
5613
5614 #define BILINEAR_SKIP_ONE_PIXEL() \
5615 do { \
5616 vx += unit_x; \
5617 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
5618 } while (0)
5619
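/* Plain bilinear SRC scanline: every destination pixel is just the
 * interpolated source pixel, written four at a time with two- and
 * one-pixel tails and no blending. */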
5620 static force_inline void
5621 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
5622 const uint32_t * mask,
5623 const uint32_t * src_top,
5624 const uint32_t * src_bottom,
5625 int32_t w,
5626 int wt,
5627 int wb,
5628 pixman_fixed_t vx,
5629 pixman_fixed_t unit_x,
5630 pixman_fixed_t max_vx,
5631 pixman_bool_t zero_src)
5632 {
5633 BILINEAR_DECLARE_VARIABLES;
5634 uint32_t pix1, pix2, pix3, pix4;
5635
5636 while ((w -= 4) >= 0)
5637 {
5638 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5639 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5640 BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5641 BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5642 *dst++ = pix1;
5643 *dst++ = pix2;
5644 *dst++ = pix3;
5645 *dst++ = pix4;
5646 }
5647
5648 if (w & 2)
5649 {
5650 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5651 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5652 *dst++ = pix1;
5653 *dst++ = pix2;
5654 }
5655
5656 if (w & 1)
5657 {
5658 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5659 *dst = pix1;
5660 }
5661
5662 }
5663
5664 /* Add extra NULL argument to the existing bilinear fast paths to indicate
5665 * that we don't need two-pass processing */
5666
5667 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
5668 scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
5669 uint32_t, uint32_t, uint32_t,
5670 COVER, FLAG_NONE)
5671 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
5672 scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
5673 uint32_t, uint32_t, uint32_t,
5674 PAD, FLAG_NONE)
5675 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
5676 scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
5677 uint32_t, uint32_t, uint32_t,
5678 NONE, FLAG_NONE)
5679 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
5680 scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
5681 uint32_t, uint32_t, uint32_t,
5682 NORMAL, FLAG_NONE)
5683
5684 static force_inline void
5685 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst,
5686 const uint32_t * mask,
5687 const uint32_t * src_top,
5688 const uint32_t * src_bottom,
5689 int32_t w,
5690 int wt,
5691 int wb,
5692 pixman_fixed_t vx,
5693 pixman_fixed_t unit_x,
5694 pixman_fixed_t max_vx,
5695 pixman_bool_t zero_src)
5696 {
5697 BILINEAR_DECLARE_VARIABLES;
5698 uint32_t pix1, pix2, pix3, pix4;
5699
5700 while (w && ((uintptr_t)dst & 15))
5701 {
5702 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5703
5704 if (pix1)
5705 {
5706 pix2 = *dst;
5707 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5708 }
5709
5710 w--;
5711 dst++;
5712 }
5713
5714 while (w >= 4)
5715 {
5716 __m128i xmm_src;
5717 __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
5718 __m128i xmm_alpha_hi, xmm_alpha_lo;
5719
5720 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5721 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5722 BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5723 BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5724
5725 xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
5726
5727 if (!is_zero (xmm_src))
5728 {
5729 if (is_opaque (xmm_src))
5730 {
5731 save_128_aligned ((__m128i *)dst, xmm_src);
5732 }
5733 else
5734 {
5735 __m128i xmm_dst = load_128_aligned ((__m128i *)dst);
5736
5737 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5738 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5739
5740 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5741 over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
5742 &xmm_dst_lo, &xmm_dst_hi);
5743
5744 save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5745 }
5746 }
5747
5748 w -= 4;
5749 dst += 4;
5750 }
5751
5752 while (w)
5753 {
5754 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5755
5756 if (pix1)
5757 {
5758 pix2 = *dst;
5759 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5760 }
5761
5762 w--;
5763 dst++;
5764 }
5765 }
5766
5767 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
5768 scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
5769 uint32_t, uint32_t, uint32_t,
5770 COVER, FLAG_NONE)
5771 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
5772 scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
5773 uint32_t, uint32_t, uint32_t,
5774 PAD, FLAG_NONE)
5775 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
5776 scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
5777 uint32_t, uint32_t, uint32_t,
5778 NONE, FLAG_NONE)
5779 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
5780 scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
5781 uint32_t, uint32_t, uint32_t,
5782 NORMAL, FLAG_NONE)
5783
5784
5785 /* An example of SSE2 two-stage bilinear_over_8888_0565 fast path, which is implemented
5786 as scaled_bilinear_scanline_sse2_8888_8888_SRC + op_bilinear_over_8888_0565 */
5787
5788 void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width)
5789 {
5790 /* Note: this is not really fast and should be based on the 8-pixel loop from sse2_composite_over_8888_0565 */
5791 while (--width >= 0)
5792 {
5793 *dst = composite_over_8888_0565pixel (*src, *dst);
5794 src++;
5795 dst++;
5796 }
5797 }
5798
5799 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_cover_OVER,
5800 scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
5801 uint32_t, uint32_t, uint16_t,
5802 COVER, FLAG_NONE)
5803 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_pad_OVER,
5804 scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
5805 uint32_t, uint32_t, uint16_t,
5806 PAD, FLAG_NONE)
5807 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_none_OVER,
5808 scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
5809 uint32_t, uint32_t, uint16_t,
5810 NONE, FLAG_NONE)
5811 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_normal_OVER,
5812 scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
5813 uint32_t, uint32_t, uint16_t,
5814 NORMAL, FLAG_NONE)
5815
5816 /*****************************/
5817
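/*
 * Bilinear OVER with an a8 mask.  Interpolation is skipped for pixels
 * (or whole 4-pixel blocks) whose mask bytes are zero, and a block is
 * stored directly when the mask word is 0xffffffff and all four
 * interpolated pixels are opaque.
 */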
5818 static force_inline void
5819 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
5820 const uint8_t * mask,
5821 const uint32_t * src_top,
5822 const uint32_t * src_bottom,
5823 int32_t w,
5824 int wt,
5825 int wb,
5826 pixman_fixed_t vx,
5827 pixman_fixed_t unit_x,
5828 pixman_fixed_t max_vx,
5829 pixman_bool_t zero_src)
5830 {
5831 BILINEAR_DECLARE_VARIABLES;
5832 uint32_t pix1, pix2, pix3, pix4;
5833 uint32_t m;
5834
5835 while (w && ((uintptr_t)dst & 15))
5836 {
5837 uint32_t sa;
5838
5839 m = (uint32_t) *mask++;
5840
5841 if (m)
5842 {
5843 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5844 sa = pix1 >> 24;
5845
5846 if (sa == 0xff && m == 0xff)
5847 {
5848 *dst = pix1;
5849 }
5850 else
5851 {
5852 __m128i ms, md, ma, msa;
5853
5854 pix2 = *dst;
5855 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5856 ms = unpack_32_1x128 (pix1);
5857 md = unpack_32_1x128 (pix2);
5858
5859 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5860
5861 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5862 }
5863 }
5864 else
5865 {
5866 BILINEAR_SKIP_ONE_PIXEL ();
5867 }
5868
5869 w--;
5870 dst++;
5871 }
5872
5873 while (w >= 4)
5874 {
5875 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5876 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5877 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5878
5879 m = *(uint32_t*)mask;
5880
5881 if (m)
5882 {
5883 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5884 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5885 BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5886 BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5887
5888 xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
5889
5890 if (m == 0xffffffff && is_opaque (xmm_src))
5891 {
5892 save_128_aligned ((__m128i *)dst, xmm_src);
5893 }
5894 else
5895 {
5896 xmm_dst = load_128_aligned ((__m128i *)dst);
5897
5898 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5899
5900 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5901 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5902 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5903
5904 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5905 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5906
5907 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5908 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5909
5910 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5911 }
5912 }
5913 else
5914 {
5915 BILINEAR_SKIP_ONE_PIXEL ();
5916 BILINEAR_SKIP_ONE_PIXEL ();
5917 BILINEAR_SKIP_ONE_PIXEL ();
5918 BILINEAR_SKIP_ONE_PIXEL ();
5919 }
5920
5921 w -= 4;
5922 dst += 4;
5923 mask += 4;
5924 }
5925
5926 while (w)
5927 {
5928 uint32_t sa;
5929
5930 m = (uint32_t) *mask++;
5931
5932 if (m)
5933 {
5934 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5935 sa = pix1 >> 24;
5936
5937 if (sa == 0xff && m == 0xff)
5938 {
5939 *dst = pix1;
5940 }
5941 else
5942 {
5943 __m128i ms, md, ma, msa;
5944
5945 pix2 = *dst;
5946 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5947 ms = unpack_32_1x128 (pix1);
5948 md = unpack_32_1x128 (pix2);
5949
5950 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5951
5952 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5953 }
5954 }
5955 else
5956 {
5957 BILINEAR_SKIP_ONE_PIXEL ();
5958 }
5959
5960 w--;
5961 dst++;
5962 }
5963 }
5964
5965 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
5966 scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
5967 uint32_t, uint8_t, uint32_t,
5968 COVER, FLAG_HAVE_NON_SOLID_MASK)
5969 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
5970 scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
5971 uint32_t, uint8_t, uint32_t,
5972 PAD, FLAG_HAVE_NON_SOLID_MASK)
5973 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
5974 scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
5975 uint32_t, uint8_t, uint32_t,
5976 NONE, FLAG_HAVE_NON_SOLID_MASK)
5977 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
5978 scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
5979 uint32_t, uint8_t, uint32_t,
5980 NORMAL, FLAG_HAVE_NON_SOLID_MASK)
5981
5982 static force_inline void
5983 scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst,
5984 const uint32_t * mask,
5985 const uint32_t * src_top,
5986 const uint32_t * src_bottom,
5987 int32_t w,
5988 int wt,
5989 int wb,
5990 pixman_fixed_t vx,
5991 pixman_fixed_t unit_x,
5992 pixman_fixed_t max_vx,
5993 pixman_bool_t zero_src)
5994 {
5995 BILINEAR_DECLARE_VARIABLES;
5996 uint32_t pix1, pix2, pix3, pix4;
5997 __m128i xmm_mask;
5998
5999 if (zero_src || (*mask >> 24) == 0)
6000 return;
6001
6002 xmm_mask = create_mask_16_128 (*mask >> 24);
6003
6004 while (w && ((uintptr_t)dst & 15))
6005 {
6006 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6007 if (pix1)
6008 {
6009 uint32_t d = *dst;
6010
6011 __m128i ms = unpack_32_1x128 (pix1);
6012 __m128i alpha = expand_alpha_1x128 (ms);
6013 __m128i dest = xmm_mask;
6014 __m128i alpha_dst = unpack_32_1x128 (d);
6015
6016 *dst = pack_1x128_32
6017 (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6018 }
6019
6020 dst++;
6021 w--;
6022 }
6023
6024 while (w >= 4)
6025 {
6026 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6027 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
6028 BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
6029 BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
6030
6031 if (pix1 | pix2 | pix3 | pix4)
6032 {
6033 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
6034 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
6035 __m128i xmm_alpha_lo, xmm_alpha_hi;
6036
6037 xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
6038
6039 xmm_dst = load_128_aligned ((__m128i*)dst);
6040
6041 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
6042 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
6043 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
6044 &xmm_alpha_lo, &xmm_alpha_hi);
6045
6046 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
6047 &xmm_alpha_lo, &xmm_alpha_hi,
6048 &xmm_mask, &xmm_mask,
6049 &xmm_dst_lo, &xmm_dst_hi);
6050
6051 save_128_aligned
6052 ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
6053 }
6054
6055 dst += 4;
6056 w -= 4;
6057 }
6058
6059 while (w)
6060 {
6061 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
6062 if (pix1)
6063 {
6064 uint32_t d = *dst;
6065
6066 __m128i ms = unpack_32_1x128 (pix1);
6067 __m128i alpha = expand_alpha_1x128 (ms);
6068 __m128i dest = xmm_mask;
6069 __m128i alpha_dst = unpack_32_1x128 (d);
6070
6071 *dst = pack_1x128_32
6072 (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
6073 }
6074
6075 dst++;
6076 w--;
6077 }
6078 }
6079
6080 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
6081 scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
6082 uint32_t, uint32_t, uint32_t,
6083 COVER, FLAG_HAVE_SOLID_MASK)
6084 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
6085 scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
6086 uint32_t, uint32_t, uint32_t,
6087 PAD, FLAG_HAVE_SOLID_MASK)
6088 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
6089 scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
6090 uint32_t, uint32_t, uint32_t,
6091 NONE, FLAG_HAVE_SOLID_MASK)
6092 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
6093 scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
6094 uint32_t, uint32_t, uint32_t,
6095 NORMAL, FLAG_HAVE_SOLID_MASK)
6096
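/* The table below registers the routines above as pixman fast paths;
 * lookups walk it in order and take the first entry whose operator and
 * formats match. */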
6097 static const pixman_fast_path_t sse2_fast_paths[] =
6098 {
6099 /* PIXMAN_OP_OVER */
6100 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
6101 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
6102 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
6103 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
6104 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
6105 PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
6106 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
6107 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
6108 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
6109 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
6110 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
6111 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
6112 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
6113 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
6114 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
6115 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
6116 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
6117 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
6118 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
6119 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
6120 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
6121 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
6122 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
6123 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
6124 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
6125 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
6126 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
6127 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
6128 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
6129 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
6130 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
6131 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
6132 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
6133 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6134 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
6135 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6136 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
6137 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
6138 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
6139 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
6140 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
6141 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
6142 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
6143 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
6144 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
6145 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6146 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6147
6148 /* PIXMAN_OP_OVER_REVERSE */
6149 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
6150 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
6151
6152 /* PIXMAN_OP_ADD */
6153 PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
6154 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
6155 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
6156 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
6157 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
6158 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
6159 PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
6160 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
6161 PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
6162 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
6163 PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
6164 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
6165 PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
6166 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
6167
6168 /* PIXMAN_OP_SRC */
6169 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
6170 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
6171 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
6172 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
6173 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6174 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6175 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
6176 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
6177 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
6178 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
6179 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
6180 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
6181 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6182 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6183 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
6184 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
6185 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
6186 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
6187
6188 /* PIXMAN_OP_IN */
6189 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
6190 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
6191 PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
6192
6193 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6194 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6195 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6196 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6197 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6198 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6199 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6200 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6201 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6202 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6203 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6204 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6205 SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6206 SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6207 SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6208 SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6209
6210 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6211 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6212 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6213 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6214 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6215 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6216 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6217 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6218
6219 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6220 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6221 SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
6222 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6223 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6224 SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
6225
6226 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
6227 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
6228 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
6229 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
6230
6231 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
6232 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
6233 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
6234 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
6235
6236 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
6237 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
6238 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
6239 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
6240
6241 /* and here the entries for the two-stage bilinear over_8888_0565 fast path are added to the table */
6242
6243 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, sse2_8888_0565),
6244 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, b5g6r5, sse2_8888_0565),
6245
6246 { PIXMAN_OP_NONE },
6247 };
6248
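/*
 * Scanline fetchers for the narrow src iterator: each converts one row of
 * the source into a8r8g8b8 pixels in iter->buffer, with a scalar head up
 * to 16-byte destination alignment and an SSE2 body.
 */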
6249 static uint32_t *
6250 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
6251 {
6252 int w = iter->width;
6253 __m128i ff000000 = mask_ff000000;
6254 uint32_t *dst = iter->buffer;
6255 uint32_t *src = (uint32_t *)iter->bits;
6256
6257 iter->bits += iter->stride;
6258
6259 while (w && ((uintptr_t)dst) & 0x0f)
6260 {
6261 *dst++ = (*src++) | 0xff000000;
6262 w--;
6263 }
6264
6265 while (w >= 4)
6266 {
6267 save_128_aligned (
6268 (__m128i *)dst, _mm_or_si128 (
6269 load_128_unaligned ((__m128i *)src), ff000000));
6270
6271 dst += 4;
6272 src += 4;
6273 w -= 4;
6274 }
6275
6276 while (w)
6277 {
6278 *dst++ = (*src++) | 0xff000000;
6279 w--;
6280 }
6281
6282 return iter->buffer;
6283 }
6284
6285 static uint32_t *
6286 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
6287 {
6288 int w = iter->width;
6289 uint32_t *dst = iter->buffer;
6290 uint16_t *src = (uint16_t *)iter->bits;
6291 __m128i ff000000 = mask_ff000000;
6292
6293 iter->bits += iter->stride;
6294
6295 while (w && ((uintptr_t)dst) & 0x0f)
6296 {
6297 uint16_t s = *src++;
6298
6299 *dst++ = convert_0565_to_8888 (s);
6300 w--;
6301 }
6302
6303 while (w >= 8)
6304 {
6305 __m128i lo, hi, s;
6306
6307 s = _mm_loadu_si128 ((__m128i *)src);
6308
6309 lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
6310 hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
6311
6312 save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
6313 save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
6314
6315 dst += 8;
6316 src += 8;
6317 w -= 8;
6318 }
6319
6320 while (w)
6321 {
6322 uint16_t s = *src++;
6323
6324 *dst++ = convert_0565_to_8888 (s);
6325 w--;
6326 }
6327
6328 return iter->buffer;
6329 }
6330
6331 static uint32_t *
6332 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
6333 {
6334 int w = iter->width;
6335 uint32_t *dst = iter->buffer;
6336 uint8_t *src = iter->bits;
6337 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
6338
6339 iter->bits += iter->stride;
6340
6341 while (w && (((uintptr_t)dst) & 15))
6342 {
6343 *dst++ = *(src++) << 24;
6344 w--;
6345 }
6346
6347 while (w >= 16)
6348 {
6349 xmm0 = _mm_loadu_si128((__m128i *)src);
6350
6351 xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0);
6352 xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0);
6353 xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
6354 xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
6355 xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
6356 xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
6357
6358 _mm_store_si128(((__m128i *)(dst + 0)), xmm3);
6359 _mm_store_si128(((__m128i *)(dst + 4)), xmm4);
6360 _mm_store_si128(((__m128i *)(dst + 8)), xmm5);
6361 _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
6362
6363 dst += 16;
6364 src += 16;
6365 w -= 16;
6366 }
6367
6368 while (w)
6369 {
6370 *dst++ = *(src++) << 24;
6371 w--;
6372 }
6373
6374 return iter->buffer;
6375 }
6376
6377 typedef struct
6378 {
6379 pixman_format_code_t format;
6380 pixman_iter_get_scanline_t get_scanline;
6381 } fetcher_info_t;
6382
6383 static const fetcher_info_t fetchers[] =
6384 {
6385 { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
6386 { PIXMAN_r5g6b5, sse2_fetch_r5g6b5 },
6387 { PIXMAN_a8, sse2_fetch_a8 },
6388 { PIXMAN_null }
6389 };
6390
6391 static pixman_bool_t
6392 sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
6393 {
6394 pixman_image_t *image = iter->image;
6395
6396 #define FLAGS \
6397 (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
6398 FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
6399
6400 if ((iter->iter_flags & ITER_NARROW) &&
6401 (iter->image_flags & FLAGS) == FLAGS)
6402 {
6403 const fetcher_info_t *f;
6404
6405 for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
6406 {
6407 if (image->common.extended_format_code == f->format)
6408 {
6409 uint8_t *b = (uint8_t *)image->bits.bits;
6410 int s = image->bits.rowstride * 4;
6411
6412 iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
6413 iter->stride = s;
6414
6415 iter->get_scanline = f->get_scanline;
6416 return TRUE;
6417 }
6418 }
6419 }
6420
6421 return FALSE;
6422 }
6423
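/* On 32-bit x86 the ABI does not guarantee a 16-byte aligned stack, so
 * ask GCC to realign it before any SSE2 spills can happen. */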
6424 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
6425 __attribute__((__force_align_arg_pointer__))
6426 #endif
6427 pixman_implementation_t *
6428 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
6429 {
6430 pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
6431
6432 /* SSE2 constants */
6433 mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6434 mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
6435 mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
6436 mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
6437 mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
6438 mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
6439 mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
6440 mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
6441 mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
6442 mask_0080 = create_mask_16_128 (0x0080);
6443 mask_00ff = create_mask_16_128 (0x00ff);
6444 mask_0101 = create_mask_16_128 (0x0101);
6445 mask_ffff = create_mask_16_128 (0xffff);
6446 mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
6447 mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
6448 mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
6449 mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
6450
6451 /* Set up function pointers */
6452 imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
6453 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
6454 imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
6455 imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
6456 imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
6457 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
6458 imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
6459 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
6460 imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
6461 imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
6462
6463 imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
6464
6465 imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
6466 imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
6467 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
6468 imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
6469 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
6470 imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
6471 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
6472 imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
6473 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
6474 imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
6475 imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
6476
6477 imp->blt = sse2_blt;
6478 imp->fill = sse2_fill;
6479
6480 imp->src_iter_init = sse2_src_iter_init;
6481
6482 return imp;
6483 }
