gfx/cairo/libpixman/src/pixman-sse2.c

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Thu, 22 Jan 2015 13:21:57 +0100
branch:      TOR_BUG_9701
changeset:   15:b8a032363ba2
permissions: -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2  * Copyright © 2008 Rodrigo Kumpera
     3  * Copyright © 2008 André Tupinambá
     4  *
     5  * Permission to use, copy, modify, distribute, and sell this software and its
     6  * documentation for any purpose is hereby granted without fee, provided that
     7  * the above copyright notice appear in all copies and that both that
     8  * copyright notice and this permission notice appear in supporting
     9  * documentation, and that the name of Red Hat not be used in advertising or
    10  * publicity pertaining to distribution of the software without specific,
    11  * written prior permission.  Red Hat makes no representations about the
    12  * suitability of this software for any purpose.  It is provided "as is"
    13  * without express or implied warranty.
    14  *
    15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
    16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
    17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
    18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
    19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
    20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
    21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
    22  * SOFTWARE.
    23  *
    24  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
    25  *          André Tupinambá (andrelrt@gmail.com)
    26  *
    27  * Based on work by Owen Taylor and Søren Sandmann
    28  */
    29 #ifdef HAVE_CONFIG_H
    30 #include <config.h>
    31 #endif
    33 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
    34 #include <emmintrin.h> /* for SSE2 intrinsics */
    35 #include "pixman-private.h"
    36 #include "pixman-combine32.h"
    37 #include "pixman-inlines.h"
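        /* Constant vectors used by the SSE2 paths below.  They are filled in
         * once, when the SSE2 implementation is set up (outside the portion
         * shown here), and treated as read-only afterwards.
         */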
    39 static __m128i mask_0080;
    40 static __m128i mask_00ff;
    41 static __m128i mask_0101;
    42 static __m128i mask_ffff;
    43 static __m128i mask_ff000000;
    44 static __m128i mask_alpha;
    46 static __m128i mask_565_r;
    47 static __m128i mask_565_g1, mask_565_g2;
    48 static __m128i mask_565_b;
    49 static __m128i mask_red;
    50 static __m128i mask_green;
    51 static __m128i mask_blue;
    53 static __m128i mask_565_fix_rb;
    54 static __m128i mask_565_fix_g;
    56 static __m128i mask_565_rb;
    57 static __m128i mask_565_pack_multiplier;
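        /* The unpack_* helpers widen packed 8-bit channels to 16 bits per
         * channel (one pixel per 64-bit half) so intermediate products fit
         * without overflow; the pack_* helpers below reverse this with
         * unsigned saturation.
         */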
    59 static force_inline __m128i
    60 unpack_32_1x128 (uint32_t data)
    61 {
    62     return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
    63 }
    65 static force_inline void
    66 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
    67 {
    68     *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    69     *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
    70 }
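        /* Convert r5g6b5 values held in 32-bit lanes to x8r8g8b8: shift each
         * field into its byte, then replicate the field's high bits into the
         * freshly opened low bits so 0x1f expands to 0xff and 0x3f to 0xff.
         */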
    72 static force_inline __m128i
    73 unpack_565_to_8888 (__m128i lo)
    74 {
    75     __m128i r, g, b, rb, t;
    77     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    78     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    79     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
    81     rb = _mm_or_si128 (r, b);
    82     t  = _mm_and_si128 (rb, mask_565_fix_rb);
    83     t  = _mm_srli_epi32 (t, 5);
    84     rb = _mm_or_si128 (rb, t);
    86     t  = _mm_and_si128 (g, mask_565_fix_g);
    87     t  = _mm_srli_epi32 (t, 6);
    88     g  = _mm_or_si128 (g, t);
    90     return _mm_or_si128 (rb, g);
    91 }
    93 static force_inline void
    94 unpack_565_128_4x128 (__m128i  data,
    95                       __m128i* data0,
    96                       __m128i* data1,
    97                       __m128i* data2,
    98                       __m128i* data3)
    99 {
   100     __m128i lo, hi;
   102     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
   103     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
   105     lo = unpack_565_to_8888 (lo);
   106     hi = unpack_565_to_8888 (hi);
   108     unpack_128_2x128 (lo, data0, data1);
   109     unpack_128_2x128 (hi, data2, data3);
   110 }
   112 static force_inline uint16_t
   113 pack_565_32_16 (uint32_t pixel)
   114 {
   115     return (uint16_t) (((pixel >> 8) & 0xf800) |
   116 		       ((pixel >> 5) & 0x07e0) |
   117 		       ((pixel >> 3) & 0x001f));
   118 }
   120 static force_inline __m128i
   121 pack_2x128_128 (__m128i lo, __m128i hi)
   122 {
   123     return _mm_packus_epi16 (lo, hi);
   124 }
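        /* Pack eight x8r8g8b8 pixels (still one per 32-bit lane in lo/hi)
         * straight to r5g6b5.  The multiply-add against
         * mask_565_pack_multiplier effectively shifts red and blue into their
         * 565 positions in one step; green is masked in separately.
         */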
   126 static force_inline __m128i
   127 pack_565_2packedx128_128 (__m128i lo, __m128i hi)
   128 {
   129     __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
   130     __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
   132     __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
   133     __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
   135     __m128i g0 = _mm_and_si128 (lo, mask_green);
   136     __m128i g1 = _mm_and_si128 (hi, mask_green);
   138     t0 = _mm_or_si128 (t0, g0);
   139     t1 = _mm_or_si128 (t1, g1);
   141     /* Simulates _mm_packus_epi32 */
   142     t0 = _mm_slli_epi32 (t0, 16 - 5);
   143     t1 = _mm_slli_epi32 (t1, 16 - 5);
   144     t0 = _mm_srai_epi32 (t0, 16);
   145     t1 = _mm_srai_epi32 (t1, 16);
   146     return _mm_packs_epi32 (t0, t1);
   147 }
   149 static force_inline __m128i
   150 pack_565_2x128_128 (__m128i lo, __m128i hi)
   151 {
   152     __m128i data;
   153     __m128i r, g1, g2, b;
   155     data = pack_2x128_128 (lo, hi);
   157     r  = _mm_and_si128 (data, mask_565_r);
   158     g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
   159     g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
   160     b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
   162     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
   163 }
   165 static force_inline __m128i
   166 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
   167 {
   168     return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
   169 			     pack_565_2x128_128 (*xmm2, *xmm3));
   170 }
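        /* The three tests below use a byte mask of 0x8888 so only bytes
         * 3, 7, 11 and 15 -- the alpha bytes of four packed pixels -- are
         * examined: is_opaque means all four alphas are 0xff, is_transparent
         * means they are all zero, and is_zero means the whole vector is zero.
         */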
   172 static force_inline int
   173 is_opaque (__m128i x)
   174 {
   175     __m128i ffs = _mm_cmpeq_epi8 (x, x);
   177     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
   178 }
   180 static force_inline int
   181 is_zero (__m128i x)
   182 {
   183     return _mm_movemask_epi8 (
   184 	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
   185 }
   187 static force_inline int
   188 is_transparent (__m128i x)
   189 {
   190     return (_mm_movemask_epi8 (
   191 		_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
   192 }
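        /* The expand_* helpers broadcast either a whole unpacked pixel into
         * both halves of the register, or just its alpha lane across all four
         * 16-bit lanes of each half, producing a per-channel multiplier.
         */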
   194 static force_inline __m128i
   195 expand_pixel_32_1x128 (uint32_t data)
   196 {
   197     return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
   198 }
   200 static force_inline __m128i
   201 expand_alpha_1x128 (__m128i data)
   202 {
   203     return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
   204 						     _MM_SHUFFLE (3, 3, 3, 3)),
   205 				_MM_SHUFFLE (3, 3, 3, 3));
   206 }
   208 static force_inline void
   209 expand_alpha_2x128 (__m128i  data_lo,
   210                     __m128i  data_hi,
   211                     __m128i* alpha_lo,
   212                     __m128i* alpha_hi)
   213 {
   214     __m128i lo, hi;
   216     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
   217     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
   219     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
   220     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
   221 }
   223 static force_inline void
   224 expand_alpha_rev_2x128 (__m128i  data_lo,
   225                         __m128i  data_hi,
   226                         __m128i* alpha_lo,
   227                         __m128i* alpha_hi)
   228 {
   229     __m128i lo, hi;
   231     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
   232     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
   233     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
   234     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
   235 }
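        /* Per-channel multiply of two unpacked operands: a*b/255 for each
         * channel, using the usual pixman rounding trick -- add 0x0080, then
         * take the high 16 bits of a multiply by 0x0101 -- instead of a real
         * division.
         */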
   237 static force_inline void
   238 pix_multiply_2x128 (__m128i* data_lo,
   239                     __m128i* data_hi,
   240                     __m128i* alpha_lo,
   241                     __m128i* alpha_hi,
   242                     __m128i* ret_lo,
   243                     __m128i* ret_hi)
   244 {
   245     __m128i lo, hi;
   247     lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
   248     hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
   249     lo = _mm_adds_epu16 (lo, mask_0080);
   250     hi = _mm_adds_epu16 (hi, mask_0080);
   251     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
   252     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
   253 }
   255 static force_inline void
   256 pix_add_multiply_2x128 (__m128i* src_lo,
   257                         __m128i* src_hi,
   258                         __m128i* alpha_dst_lo,
   259                         __m128i* alpha_dst_hi,
   260                         __m128i* dst_lo,
   261                         __m128i* dst_hi,
   262                         __m128i* alpha_src_lo,
   263                         __m128i* alpha_src_hi,
   264                         __m128i* ret_lo,
   265                         __m128i* ret_hi)
   266 {
   267     __m128i t1_lo, t1_hi;
   268     __m128i t2_lo, t2_hi;
   270     pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
   271     pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
   273     *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
   274     *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
   275 }
   277 static force_inline void
   278 negate_2x128 (__m128i  data_lo,
   279               __m128i  data_hi,
   280               __m128i* neg_lo,
   281               __m128i* neg_hi)
   282 {
   283     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
   284     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
   285 }
   287 static force_inline void
   288 invert_colors_2x128 (__m128i  data_lo,
   289                      __m128i  data_hi,
   290                      __m128i* inv_lo,
   291                      __m128i* inv_hi)
   292 {
   293     __m128i lo, hi;
   295     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
   296     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
   297     *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
   298     *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
   299 }
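        /* Unpacked OVER operator: dst = src + dst * (255 - alpha) / 255, with
         * a saturating add so results cannot wrap.
         */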
   301 static force_inline void
   302 over_2x128 (__m128i* src_lo,
   303             __m128i* src_hi,
   304             __m128i* alpha_lo,
   305             __m128i* alpha_hi,
   306             __m128i* dst_lo,
   307             __m128i* dst_hi)
   308 {
   309     __m128i t1, t2;
   311     negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
   313     pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
   315     *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
   316     *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
   317 }
   319 static force_inline void
   320 over_rev_non_pre_2x128 (__m128i  src_lo,
   321                         __m128i  src_hi,
   322                         __m128i* dst_lo,
   323                         __m128i* dst_hi)
   324 {
   325     __m128i lo, hi;
   326     __m128i alpha_lo, alpha_hi;
   328     expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
   330     lo = _mm_or_si128 (alpha_lo, mask_alpha);
   331     hi = _mm_or_si128 (alpha_hi, mask_alpha);
   333     invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
   335     pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
   337     over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
   338 }
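        /* in_over = (src IN mask) OVER dst: both the source and its alpha are
         * first multiplied by the mask, then composited over the destination.
         */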
   340 static force_inline void
   341 in_over_2x128 (__m128i* src_lo,
   342                __m128i* src_hi,
   343                __m128i* alpha_lo,
   344                __m128i* alpha_hi,
   345                __m128i* mask_lo,
   346                __m128i* mask_hi,
   347                __m128i* dst_lo,
   348                __m128i* dst_hi)
   349 {
   350     __m128i s_lo, s_hi;
   351     __m128i a_lo, a_hi;
   353     pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
   354     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
   356     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
   357 }
   359 /* load 4 pixels from a 16-byte boundary aligned address */
   360 static force_inline __m128i
   361 load_128_aligned (__m128i* src)
   362 {
   363     return _mm_load_si128 (src);
   364 }
    366 /* load 4 pixels from an unaligned address */
   367 static force_inline __m128i
   368 load_128_unaligned (const __m128i* src)
   369 {
   370     return _mm_loadu_si128 (src);
   371 }
   373 /* save 4 pixels using Write Combining memory on a 16-byte
   374  * boundary aligned address
   375  */
   376 static force_inline void
   377 save_128_write_combining (__m128i* dst,
   378                           __m128i  data)
   379 {
   380     _mm_stream_si128 (dst, data);
   381 }
   383 /* save 4 pixels on a 16-byte boundary aligned address */
   384 static force_inline void
   385 save_128_aligned (__m128i* dst,
   386                   __m128i  data)
   387 {
   388     _mm_store_si128 (dst, data);
   389 }
    391 /* save 4 pixels to an unaligned address */
   392 static force_inline void
   393 save_128_unaligned (__m128i* dst,
   394                     __m128i  data)
   395 {
   396     _mm_storeu_si128 (dst, data);
   397 }
   399 static force_inline __m128i
   400 load_32_1x128 (uint32_t data)
   401 {
   402     return _mm_cvtsi32_si128 (data);
   403 }
   405 static force_inline __m128i
   406 expand_alpha_rev_1x128 (__m128i data)
   407 {
   408     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
   409 }
   411 static force_inline __m128i
   412 expand_pixel_8_1x128 (uint8_t data)
   413 {
   414     return _mm_shufflelo_epi16 (
   415 	unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
   416 }
   418 static force_inline __m128i
   419 pix_multiply_1x128 (__m128i data,
   420 		    __m128i alpha)
   421 {
   422     return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
   423 					    mask_0080),
   424 			    mask_0101);
   425 }
   427 static force_inline __m128i
   428 pix_add_multiply_1x128 (__m128i* src,
   429 			__m128i* alpha_dst,
   430 			__m128i* dst,
   431 			__m128i* alpha_src)
   432 {
   433     __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
   434     __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
   436     return _mm_adds_epu8 (t1, t2);
   437 }
   439 static force_inline __m128i
   440 negate_1x128 (__m128i data)
   441 {
   442     return _mm_xor_si128 (data, mask_00ff);
   443 }
   445 static force_inline __m128i
   446 invert_colors_1x128 (__m128i data)
   447 {
   448     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
   449 }
   451 static force_inline __m128i
   452 over_1x128 (__m128i src, __m128i alpha, __m128i dst)
   453 {
   454     return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
   455 }
   457 static force_inline __m128i
   458 in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
   459 {
   460     return over_1x128 (pix_multiply_1x128 (*src, *mask),
   461 		       pix_multiply_1x128 (*alpha, *mask),
   462 		       *dst);
   463 }
   465 static force_inline __m128i
   466 over_rev_non_pre_1x128 (__m128i src, __m128i dst)
   467 {
   468     __m128i alpha = expand_alpha_1x128 (src);
   470     return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
   471 					   _mm_or_si128 (alpha, mask_alpha)),
   472 		       alpha,
   473 		       dst);
   474 }
   476 static force_inline uint32_t
   477 pack_1x128_32 (__m128i data)
   478 {
   479     return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
   480 }
   482 static force_inline __m128i
   483 expand565_16_1x128 (uint16_t pixel)
   484 {
   485     __m128i m = _mm_cvtsi32_si128 (pixel);
   487     m = unpack_565_to_8888 (m);
   489     return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
   490 }
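        /* Single-pixel OVER with the usual shortcuts: an opaque source
         * replaces the destination, a zero source leaves it untouched, and
         * anything else goes through the unpacked over_1x128 path.
         */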
   492 static force_inline uint32_t
   493 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
   494 {
   495     uint8_t a;
   496     __m128i xmms;
   498     a = src >> 24;
   500     if (a == 0xff)
   501     {
   502 	return src;
   503     }
   504     else if (src)
   505     {
   506 	xmms = unpack_32_1x128 (src);
   507 	return pack_1x128_32 (
   508 	    over_1x128 (xmms, expand_alpha_1x128 (xmms),
   509 			unpack_32_1x128 (dst)));
   510     }
   512     return dst;
   513 }
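        /* combine1/combine4 fetch one or four source pixels and, when a mask
         * is present, multiply them by the mask's expanded alpha; combine4
         * also short-circuits to zero when all four mask pixels are
         * transparent.
         */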
   515 static force_inline uint32_t
   516 combine1 (const uint32_t *ps, const uint32_t *pm)
   517 {
   518     uint32_t s = *ps;
   520     if (pm)
   521     {
   522 	__m128i ms, mm;
   524 	mm = unpack_32_1x128 (*pm);
   525 	mm = expand_alpha_1x128 (mm);
   527 	ms = unpack_32_1x128 (s);
   528 	ms = pix_multiply_1x128 (ms, mm);
   530 	s = pack_1x128_32 (ms);
   531     }
   533     return s;
   534 }
   536 static force_inline __m128i
   537 combine4 (const __m128i *ps, const __m128i *pm)
   538 {
   539     __m128i xmm_src_lo, xmm_src_hi;
   540     __m128i xmm_msk_lo, xmm_msk_hi;
   541     __m128i s;
   543     if (pm)
   544     {
   545 	xmm_msk_lo = load_128_unaligned (pm);
   547 	if (is_transparent (xmm_msk_lo))
   548 	    return _mm_setzero_si128 ();
   549     }
   551     s = load_128_unaligned (ps);
   553     if (pm)
   554     {
   555 	unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
   556 	unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
   558 	expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
   560 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
   561 			    &xmm_msk_lo, &xmm_msk_hi,
   562 			    &xmm_src_lo, &xmm_src_hi);
   564 	s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
   565     }
   567     return s;
   568 }
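        /* The OVER workers below share one structure: a scalar loop until dst
         * is 16-byte aligned, a 4-pixels-at-a-time loop with aligned stores
         * (skipping the blend for all-zero or all-opaque source blocks), and
         * finally a scalar tail.
         */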
   570 static force_inline void
   571 core_combine_over_u_sse2_mask (uint32_t *	  pd,
   572 			       const uint32_t*    ps,
   573 			       const uint32_t*    pm,
   574 			       int                w)
   575 {
   576     uint32_t s, d;
   578     /* Align dst on a 16-byte boundary */
   579     while (w && ((uintptr_t)pd & 15))
   580     {
   581 	d = *pd;
   582 	s = combine1 (ps, pm);
   584 	if (s)
   585 	    *pd = core_combine_over_u_pixel_sse2 (s, d);
   586 	pd++;
   587 	ps++;
   588 	pm++;
   589 	w--;
   590     }
   592     while (w >= 4)
   593     {
   594 	__m128i mask = load_128_unaligned ((__m128i *)pm);
   596 	if (!is_zero (mask))
   597 	{
   598 	    __m128i src;
   599 	    __m128i src_hi, src_lo;
   600 	    __m128i mask_hi, mask_lo;
   601 	    __m128i alpha_hi, alpha_lo;
   603 	    src = load_128_unaligned ((__m128i *)ps);
   605 	    if (is_opaque (_mm_and_si128 (src, mask)))
   606 	    {
   607 		save_128_aligned ((__m128i *)pd, src);
   608 	    }
   609 	    else
   610 	    {
   611 		__m128i dst = load_128_aligned ((__m128i *)pd);
   612 		__m128i dst_hi, dst_lo;
   614 		unpack_128_2x128 (mask, &mask_lo, &mask_hi);
   615 		unpack_128_2x128 (src, &src_lo, &src_hi);
   617 		expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
   618 		pix_multiply_2x128 (&src_lo, &src_hi,
   619 				    &mask_lo, &mask_hi,
   620 				    &src_lo, &src_hi);
   622 		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
   624 		expand_alpha_2x128 (src_lo, src_hi,
   625 				    &alpha_lo, &alpha_hi);
   627 		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
   628 			    &dst_lo, &dst_hi);
   630 		save_128_aligned (
   631 		    (__m128i *)pd,
   632 		    pack_2x128_128 (dst_lo, dst_hi));
   633 	    }
   634 	}
   636 	pm += 4;
   637 	ps += 4;
   638 	pd += 4;
   639 	w -= 4;
   640     }
   641     while (w)
   642     {
   643 	d = *pd;
   644 	s = combine1 (ps, pm);
   646 	if (s)
   647 	    *pd = core_combine_over_u_pixel_sse2 (s, d);
   648 	pd++;
   649 	ps++;
   650 	pm++;
   652 	w--;
   653     }
   654 }
   656 static force_inline void
   657 core_combine_over_u_sse2_no_mask (uint32_t *	  pd,
   658 				  const uint32_t*    ps,
   659 				  int                w)
   660 {
   661     uint32_t s, d;
   663     /* Align dst on a 16-byte boundary */
   664     while (w && ((uintptr_t)pd & 15))
   665     {
   666 	d = *pd;
   667 	s = *ps;
   669 	if (s)
   670 	    *pd = core_combine_over_u_pixel_sse2 (s, d);
   671 	pd++;
   672 	ps++;
   673 	w--;
   674     }
   676     while (w >= 4)
   677     {
   678 	__m128i src;
   679 	__m128i src_hi, src_lo, dst_hi, dst_lo;
   680 	__m128i alpha_hi, alpha_lo;
   682 	src = load_128_unaligned ((__m128i *)ps);
   684 	if (!is_zero (src))
   685 	{
   686 	    if (is_opaque (src))
   687 	    {
   688 		save_128_aligned ((__m128i *)pd, src);
   689 	    }
   690 	    else
   691 	    {
   692 		__m128i dst = load_128_aligned ((__m128i *)pd);
   694 		unpack_128_2x128 (src, &src_lo, &src_hi);
   695 		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
   697 		expand_alpha_2x128 (src_lo, src_hi,
   698 				    &alpha_lo, &alpha_hi);
   699 		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
   700 			    &dst_lo, &dst_hi);
   702 		save_128_aligned (
   703 		    (__m128i *)pd,
   704 		    pack_2x128_128 (dst_lo, dst_hi));
   705 	    }
   706 	}
   708 	ps += 4;
   709 	pd += 4;
   710 	w -= 4;
   711     }
   712     while (w)
   713     {
   714 	d = *pd;
   715 	s = *ps;
   717 	if (s)
   718 	    *pd = core_combine_over_u_pixel_sse2 (s, d);
   719 	pd++;
   720 	ps++;
   722 	w--;
   723     }
   724 }
   726 static force_inline void
   727 sse2_combine_over_u (pixman_implementation_t *imp,
   728                      pixman_op_t              op,
   729                      uint32_t *               pd,
   730                      const uint32_t *         ps,
   731                      const uint32_t *         pm,
   732                      int                      w)
   733 {
   734     if (pm)
   735 	core_combine_over_u_sse2_mask (pd, ps, pm, w);
   736     else
   737 	core_combine_over_u_sse2_no_mask (pd, ps, w);
   738 }
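        /* OVER_REVERSE simply swaps the operands: the existing destination is
         * composited over the incoming source.
         */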
   740 static void
   741 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
   742                              pixman_op_t              op,
   743                              uint32_t *               pd,
   744                              const uint32_t *         ps,
   745                              const uint32_t *         pm,
   746                              int                      w)
   747 {
   748     uint32_t s, d;
   750     __m128i xmm_dst_lo, xmm_dst_hi;
   751     __m128i xmm_src_lo, xmm_src_hi;
   752     __m128i xmm_alpha_lo, xmm_alpha_hi;
   754     /* Align dst on a 16-byte boundary */
   755     while (w &&
   756            ((uintptr_t)pd & 15))
   757     {
   758 	d = *pd;
   759 	s = combine1 (ps, pm);
   761 	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
   762 	w--;
   763 	ps++;
   764 	if (pm)
   765 	    pm++;
   766     }
   768     while (w >= 4)
   769     {
   770 	/* I'm loading unaligned because I'm not sure
   771 	 * about the address alignment.
   772 	 */
   773 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
   774 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
   776 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   777 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   779 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
   780 			    &xmm_alpha_lo, &xmm_alpha_hi);
   782 	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
   783 		    &xmm_alpha_lo, &xmm_alpha_hi,
   784 		    &xmm_src_lo, &xmm_src_hi);
    786 	/* rebuild the 4 pixel data and save */
   787 	save_128_aligned ((__m128i*)pd,
   788 			  pack_2x128_128 (xmm_src_lo, xmm_src_hi));
   790 	w -= 4;
   791 	ps += 4;
   792 	pd += 4;
   794 	if (pm)
   795 	    pm += 4;
   796     }
   798     while (w)
   799     {
   800 	d = *pd;
   801 	s = combine1 (ps, pm);
   803 	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
   804 	ps++;
   805 	w--;
   806 	if (pm)
   807 	    pm++;
   808     }
   809 }
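        /* IN scales the source by the destination alpha (src * da);
         * IN_REVERSE, below, is the mirror image (dst * sa).  Note that the
         * pixel helper's first argument supplies the alpha and the second the
         * colour being scaled.
         */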
   811 static force_inline uint32_t
   812 core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
   813 {
   814     uint32_t maska = src >> 24;
   816     if (maska == 0)
   817     {
   818 	return 0;
   819     }
   820     else if (maska != 0xff)
   821     {
   822 	return pack_1x128_32 (
   823 	    pix_multiply_1x128 (unpack_32_1x128 (dst),
   824 				expand_alpha_1x128 (unpack_32_1x128 (src))));
   825     }
   827     return dst;
   828 }
   830 static void
   831 sse2_combine_in_u (pixman_implementation_t *imp,
   832                    pixman_op_t              op,
   833                    uint32_t *               pd,
   834                    const uint32_t *         ps,
   835                    const uint32_t *         pm,
   836                    int                      w)
   837 {
   838     uint32_t s, d;
   840     __m128i xmm_src_lo, xmm_src_hi;
   841     __m128i xmm_dst_lo, xmm_dst_hi;
   843     while (w && ((uintptr_t)pd & 15))
   844     {
   845 	s = combine1 (ps, pm);
   846 	d = *pd;
   848 	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
   849 	w--;
   850 	ps++;
   851 	if (pm)
   852 	    pm++;
   853     }
   855     while (w >= 4)
   856     {
   857 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
   858 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
   860 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   861 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   863 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   864 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
   865 			    &xmm_dst_lo, &xmm_dst_hi,
   866 			    &xmm_dst_lo, &xmm_dst_hi);
   868 	save_128_aligned ((__m128i*)pd,
   869 			  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   871 	ps += 4;
   872 	pd += 4;
   873 	w -= 4;
   874 	if (pm)
   875 	    pm += 4;
   876     }
   878     while (w)
   879     {
   880 	s = combine1 (ps, pm);
   881 	d = *pd;
   883 	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
   884 	w--;
   885 	ps++;
   886 	if (pm)
   887 	    pm++;
   888     }
   889 }
   891 static void
   892 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
   893                            pixman_op_t              op,
   894                            uint32_t *               pd,
   895                            const uint32_t *         ps,
   896                            const uint32_t *         pm,
   897                            int                      w)
   898 {
   899     uint32_t s, d;
   901     __m128i xmm_src_lo, xmm_src_hi;
   902     __m128i xmm_dst_lo, xmm_dst_hi;
   904     while (w && ((uintptr_t)pd & 15))
   905     {
   906 	s = combine1 (ps, pm);
   907 	d = *pd;
   909 	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
   910 	ps++;
   911 	w--;
   912 	if (pm)
   913 	    pm++;
   914     }
   916     while (w >= 4)
   917     {
   918 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
   919 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
   921 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   922 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   924 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   925 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
   926 			    &xmm_src_lo, &xmm_src_hi,
   927 			    &xmm_dst_lo, &xmm_dst_hi);
   929 	save_128_aligned (
   930 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   932 	ps += 4;
   933 	pd += 4;
   934 	w -= 4;
   935 	if (pm)
   936 	    pm += 4;
   937     }
   939     while (w)
   940     {
   941 	s = combine1 (ps, pm);
   942 	d = *pd;
   944 	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
   945 	w--;
   946 	ps++;
   947 	if (pm)
   948 	    pm++;
   949     }
   950 }
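        /* OUT_REVERSE keeps the part of the destination not covered by the
         * source: dst * (255 - sa) / 255.  OUT, further below, is the mirror
         * image, src * (255 - da) / 255.
         */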
   952 static void
   953 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
   954                             pixman_op_t              op,
   955                             uint32_t *               pd,
   956                             const uint32_t *         ps,
   957                             const uint32_t *         pm,
   958                             int                      w)
   959 {
   960     while (w && ((uintptr_t)pd & 15))
   961     {
   962 	uint32_t s = combine1 (ps, pm);
   963 	uint32_t d = *pd;
   965 	*pd++ = pack_1x128_32 (
   966 	    pix_multiply_1x128 (
   967 		unpack_32_1x128 (d), negate_1x128 (
   968 		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
   970 	if (pm)
   971 	    pm++;
   972 	ps++;
   973 	w--;
   974     }
   976     while (w >= 4)
   977     {
   978 	__m128i xmm_src_lo, xmm_src_hi;
   979 	__m128i xmm_dst_lo, xmm_dst_hi;
   981 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
   982 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
   984 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   985 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   987 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   988 	negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   990 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
   991 			    &xmm_src_lo, &xmm_src_hi,
   992 			    &xmm_dst_lo, &xmm_dst_hi);
   994 	save_128_aligned (
   995 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   997 	ps += 4;
   998 	pd += 4;
   999 	if (pm)
  1000 	    pm += 4;
  1002 	w -= 4;
  1005     while (w)
  1007 	uint32_t s = combine1 (ps, pm);
  1008 	uint32_t d = *pd;
  1010 	*pd++ = pack_1x128_32 (
  1011 	    pix_multiply_1x128 (
  1012 		unpack_32_1x128 (d), negate_1x128 (
  1013 		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
  1014 	ps++;
  1015 	if (pm)
  1016 	    pm++;
  1017 	w--;
  1021 static void
  1022 sse2_combine_out_u (pixman_implementation_t *imp,
  1023                     pixman_op_t              op,
  1024                     uint32_t *               pd,
  1025                     const uint32_t *         ps,
  1026                     const uint32_t *         pm,
  1027                     int                      w)
  1029     while (w && ((uintptr_t)pd & 15))
  1031 	uint32_t s = combine1 (ps, pm);
  1032 	uint32_t d = *pd;
  1034 	*pd++ = pack_1x128_32 (
  1035 	    pix_multiply_1x128 (
  1036 		unpack_32_1x128 (s), negate_1x128 (
  1037 		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
  1038 	w--;
  1039 	ps++;
  1040 	if (pm)
  1041 	    pm++;
  1044     while (w >= 4)
  1046 	__m128i xmm_src_lo, xmm_src_hi;
  1047 	__m128i xmm_dst_lo, xmm_dst_hi;
  1049 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
  1050 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
  1052 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1053 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1055 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1056 	negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1058 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1059 			    &xmm_dst_lo, &xmm_dst_hi,
  1060 			    &xmm_dst_lo, &xmm_dst_hi);
  1062 	save_128_aligned (
  1063 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1065 	ps += 4;
  1066 	pd += 4;
  1067 	w -= 4;
  1068 	if (pm)
  1069 	    pm += 4;
  1072     while (w)
  1074 	uint32_t s = combine1 (ps, pm);
  1075 	uint32_t d = *pd;
  1077 	*pd++ = pack_1x128_32 (
  1078 	    pix_multiply_1x128 (
  1079 		unpack_32_1x128 (s), negate_1x128 (
  1080 		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
  1081 	w--;
  1082 	ps++;
  1083 	if (pm)
  1084 	    pm++;
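        /* ATOP: the source weighted by the destination alpha plus the
         * destination weighted by the inverse source alpha, evaluated with a
         * single pix_add_multiply; the reverse variant swaps which alpha is
         * negated.
         */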
  1088 static force_inline uint32_t
  1089 core_combine_atop_u_pixel_sse2 (uint32_t src,
  1090                                 uint32_t dst)
  1092     __m128i s = unpack_32_1x128 (src);
  1093     __m128i d = unpack_32_1x128 (dst);
  1095     __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
  1096     __m128i da = expand_alpha_1x128 (d);
  1098     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
  1101 static void
  1102 sse2_combine_atop_u (pixman_implementation_t *imp,
  1103                      pixman_op_t              op,
  1104                      uint32_t *               pd,
  1105                      const uint32_t *         ps,
  1106                      const uint32_t *         pm,
  1107                      int                      w)
  1109     uint32_t s, d;
  1111     __m128i xmm_src_lo, xmm_src_hi;
  1112     __m128i xmm_dst_lo, xmm_dst_hi;
  1113     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
  1114     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
  1116     while (w && ((uintptr_t)pd & 15))
  1118 	s = combine1 (ps, pm);
  1119 	d = *pd;
  1121 	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
  1122 	w--;
  1123 	ps++;
  1124 	if (pm)
  1125 	    pm++;
  1128     while (w >= 4)
  1130 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
  1131 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
  1133 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1134 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1136 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1137 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  1138 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1139 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1141 	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
  1142 		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  1144 	pix_add_multiply_2x128 (
  1145 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
  1146 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
  1147 	    &xmm_dst_lo, &xmm_dst_hi);
  1149 	save_128_aligned (
  1150 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1152 	ps += 4;
  1153 	pd += 4;
  1154 	w -= 4;
  1155 	if (pm)
  1156 	    pm += 4;
  1159     while (w)
  1161 	s = combine1 (ps, pm);
  1162 	d = *pd;
  1164 	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
  1165 	w--;
  1166 	ps++;
  1167 	if (pm)
  1168 	    pm++;
  1172 static force_inline uint32_t
  1173 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
  1174                                         uint32_t dst)
  1176     __m128i s = unpack_32_1x128 (src);
  1177     __m128i d = unpack_32_1x128 (dst);
  1179     __m128i sa = expand_alpha_1x128 (s);
  1180     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
  1182     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
  1185 static void
  1186 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
  1187                              pixman_op_t              op,
  1188                              uint32_t *               pd,
  1189                              const uint32_t *         ps,
  1190                              const uint32_t *         pm,
  1191                              int                      w)
  1193     uint32_t s, d;
  1195     __m128i xmm_src_lo, xmm_src_hi;
  1196     __m128i xmm_dst_lo, xmm_dst_hi;
  1197     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
  1198     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
  1200     while (w && ((uintptr_t)pd & 15))
  1202 	s = combine1 (ps, pm);
  1203 	d = *pd;
  1205 	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
  1206 	ps++;
  1207 	w--;
  1208 	if (pm)
  1209 	    pm++;
  1212     while (w >= 4)
  1214 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
  1215 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
  1217 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1218 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1220 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1221 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  1222 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1223 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1225 	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
  1226 		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1228 	pix_add_multiply_2x128 (
  1229 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
  1230 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
  1231 	    &xmm_dst_lo, &xmm_dst_hi);
  1233 	save_128_aligned (
  1234 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1236 	ps += 4;
  1237 	pd += 4;
  1238 	w -= 4;
  1239 	if (pm)
  1240 	    pm += 4;
  1243     while (w)
  1245 	s = combine1 (ps, pm);
  1246 	d = *pd;
  1248 	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
  1249 	ps++;
  1250 	w--;
  1251 	if (pm)
  1252 	    pm++;
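        /* XOR: each operand is weighted by the complement of the other's
         * alpha, i.e. src * (1 - da) + dst * (1 - sa).
         */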
  1256 static force_inline uint32_t
  1257 core_combine_xor_u_pixel_sse2 (uint32_t src,
  1258                                uint32_t dst)
  1260     __m128i s = unpack_32_1x128 (src);
  1261     __m128i d = unpack_32_1x128 (dst);
  1263     __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
  1264     __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
  1266     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
  1269 static void
  1270 sse2_combine_xor_u (pixman_implementation_t *imp,
  1271                     pixman_op_t              op,
  1272                     uint32_t *               dst,
  1273                     const uint32_t *         src,
  1274                     const uint32_t *         mask,
  1275                     int                      width)
  1277     int w = width;
  1278     uint32_t s, d;
  1279     uint32_t* pd = dst;
  1280     const uint32_t* ps = src;
  1281     const uint32_t* pm = mask;
  1283     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  1284     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1285     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
  1286     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
  1288     while (w && ((uintptr_t)pd & 15))
  1290 	s = combine1 (ps, pm);
  1291 	d = *pd;
  1293 	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
  1294 	w--;
  1295 	ps++;
  1296 	if (pm)
  1297 	    pm++;
  1300     while (w >= 4)
  1302 	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
  1303 	xmm_dst = load_128_aligned ((__m128i*) pd);
  1305 	unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  1306 	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1308 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1309 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  1310 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1311 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1313 	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
  1314 		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  1315 	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
  1316 		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1318 	pix_add_multiply_2x128 (
  1319 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
  1320 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
  1321 	    &xmm_dst_lo, &xmm_dst_hi);
  1323 	save_128_aligned (
  1324 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1326 	ps += 4;
  1327 	pd += 4;
  1328 	w -= 4;
  1329 	if (pm)
  1330 	    pm += 4;
  1333     while (w)
  1335 	s = combine1 (ps, pm);
  1336 	d = *pd;
  1338 	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
  1339 	w--;
  1340 	ps++;
  1341 	if (pm)
  1342 	    pm++;
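        /* ADD is a plain saturating byte-wise sum, so no unpacking is needed
         * and a whole 4-pixel block is handled by a single _mm_adds_epu8.
         */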
  1346 static force_inline void
  1347 sse2_combine_add_u (pixman_implementation_t *imp,
  1348                     pixman_op_t              op,
  1349                     uint32_t *               dst,
  1350                     const uint32_t *         src,
  1351                     const uint32_t *         mask,
  1352                     int                      width)
  1354     int w = width;
  1355     uint32_t s, d;
  1356     uint32_t* pd = dst;
  1357     const uint32_t* ps = src;
  1358     const uint32_t* pm = mask;
  1360     while (w && (uintptr_t)pd & 15)
  1362 	s = combine1 (ps, pm);
  1363 	d = *pd;
  1365 	ps++;
  1366 	if (pm)
  1367 	    pm++;
  1368 	*pd++ = _mm_cvtsi128_si32 (
  1369 	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
  1370 	w--;
  1373     while (w >= 4)
  1375 	__m128i s;
  1377 	s = combine4 ((__m128i*)ps, (__m128i*)pm);
  1379 	save_128_aligned (
  1380 	    (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
  1382 	pd += 4;
  1383 	ps += 4;
  1384 	if (pm)
  1385 	    pm += 4;
  1386 	w -= 4;
  1389     while (w--)
  1391 	s = combine1 (ps, pm);
  1392 	d = *pd;
  1394 	ps++;
  1395 	*pd++ = _mm_cvtsi128_si32 (
  1396 	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
  1397 	if (pm)
  1398 	    pm++;
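        /* SATURATE: when the source alpha exceeds the free space left in the
         * destination (~da), the source is first scaled down by da/sa
         * (DIV_UN8) before the saturating add.
         */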
  1402 static force_inline uint32_t
  1403 core_combine_saturate_u_pixel_sse2 (uint32_t src,
  1404                                     uint32_t dst)
  1406     __m128i ms = unpack_32_1x128 (src);
  1407     __m128i md = unpack_32_1x128 (dst);
  1408     uint32_t sa = src >> 24;
  1409     uint32_t da = ~dst >> 24;
  1411     if (sa > da)
  1413 	ms = pix_multiply_1x128 (
  1414 	    ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
  1417     return pack_1x128_32 (_mm_adds_epu16 (md, ms));
  1420 static void
  1421 sse2_combine_saturate_u (pixman_implementation_t *imp,
  1422                          pixman_op_t              op,
  1423                          uint32_t *               pd,
  1424                          const uint32_t *         ps,
  1425                          const uint32_t *         pm,
  1426                          int                      w)
  1428     uint32_t s, d;
  1430     uint32_t pack_cmp;
  1431     __m128i xmm_src, xmm_dst;
  1433     while (w && (uintptr_t)pd & 15)
  1435 	s = combine1 (ps, pm);
  1436 	d = *pd;
  1438 	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
  1439 	w--;
  1440 	ps++;
  1441 	if (pm)
  1442 	    pm++;
  1445     while (w >= 4)
  1447 	xmm_dst = load_128_aligned  ((__m128i*)pd);
  1448 	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
  1450 	pack_cmp = _mm_movemask_epi8 (
  1451 	    _mm_cmpgt_epi32 (
  1452 		_mm_srli_epi32 (xmm_src, 24),
  1453 		_mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
   1455 	/* if some src alpha is greater than the respective ~dst alpha */
  1456 	if (pack_cmp)
  1458 	    s = combine1 (ps++, pm);
  1459 	    d = *pd;
  1460 	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
  1461 	    if (pm)
  1462 		pm++;
  1464 	    s = combine1 (ps++, pm);
  1465 	    d = *pd;
  1466 	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
  1467 	    if (pm)
  1468 		pm++;
  1470 	    s = combine1 (ps++, pm);
  1471 	    d = *pd;
  1472 	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
  1473 	    if (pm)
  1474 		pm++;
  1476 	    s = combine1 (ps++, pm);
  1477 	    d = *pd;
  1478 	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
  1479 	    if (pm)
  1480 		pm++;
  1482 	else
  1484 	    save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
  1486 	    pd += 4;
  1487 	    ps += 4;
  1488 	    if (pm)
  1489 		pm += 4;
  1492 	w -= 4;
  1495     while (w--)
  1497 	s = combine1 (ps, pm);
  1498 	d = *pd;
  1500 	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
  1501 	ps++;
  1502 	if (pm)
  1503 	    pm++;
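        /* Component-alpha (_ca) combiners: the mask carries a separate 8-bit
         * factor per channel, so masking is a full per-channel multiply
         * rather than an alpha-only one.
         */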
  1507 static void
  1508 sse2_combine_src_ca (pixman_implementation_t *imp,
  1509                      pixman_op_t              op,
  1510                      uint32_t *               pd,
  1511                      const uint32_t *         ps,
  1512                      const uint32_t *         pm,
  1513                      int                      w)
  1515     uint32_t s, m;
  1517     __m128i xmm_src_lo, xmm_src_hi;
  1518     __m128i xmm_mask_lo, xmm_mask_hi;
  1519     __m128i xmm_dst_lo, xmm_dst_hi;
  1521     while (w && (uintptr_t)pd & 15)
  1523 	s = *ps++;
  1524 	m = *pm++;
  1525 	*pd++ = pack_1x128_32 (
  1526 	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
  1527 	w--;
  1530     while (w >= 4)
  1532 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1533 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1535 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1536 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1538 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1539 			    &xmm_mask_lo, &xmm_mask_hi,
  1540 			    &xmm_dst_lo, &xmm_dst_hi);
  1542 	save_128_aligned (
  1543 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1545 	ps += 4;
  1546 	pd += 4;
  1547 	pm += 4;
  1548 	w -= 4;
  1551     while (w)
  1553 	s = *ps++;
  1554 	m = *pm++;
  1555 	*pd++ = pack_1x128_32 (
  1556 	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
  1557 	w--;
  1561 static force_inline uint32_t
  1562 core_combine_over_ca_pixel_sse2 (uint32_t src,
  1563                                  uint32_t mask,
  1564                                  uint32_t dst)
  1566     __m128i s = unpack_32_1x128 (src);
  1567     __m128i expAlpha = expand_alpha_1x128 (s);
  1568     __m128i unpk_mask = unpack_32_1x128 (mask);
  1569     __m128i unpk_dst  = unpack_32_1x128 (dst);
  1571     return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
  1574 static void
  1575 sse2_combine_over_ca (pixman_implementation_t *imp,
  1576                       pixman_op_t              op,
  1577                       uint32_t *               pd,
  1578                       const uint32_t *         ps,
  1579                       const uint32_t *         pm,
  1580                       int                      w)
  1582     uint32_t s, m, d;
  1584     __m128i xmm_alpha_lo, xmm_alpha_hi;
  1585     __m128i xmm_src_lo, xmm_src_hi;
  1586     __m128i xmm_dst_lo, xmm_dst_hi;
  1587     __m128i xmm_mask_lo, xmm_mask_hi;
  1589     while (w && (uintptr_t)pd & 15)
  1591 	s = *ps++;
  1592 	m = *pm++;
  1593 	d = *pd;
  1595 	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
  1596 	w--;
  1599     while (w >= 4)
  1601 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1602 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1603 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1605 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1606 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1607 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1609 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1610 			    &xmm_alpha_lo, &xmm_alpha_hi);
  1612 	in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
  1613 		       &xmm_alpha_lo, &xmm_alpha_hi,
  1614 		       &xmm_mask_lo, &xmm_mask_hi,
  1615 		       &xmm_dst_lo, &xmm_dst_hi);
  1617 	save_128_aligned (
  1618 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1620 	ps += 4;
  1621 	pd += 4;
  1622 	pm += 4;
  1623 	w -= 4;
  1626     while (w)
  1628 	s = *ps++;
  1629 	m = *pm++;
  1630 	d = *pd;
  1632 	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
  1633 	w--;
  1637 static force_inline uint32_t
  1638 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
  1639                                          uint32_t mask,
  1640                                          uint32_t dst)
  1642     __m128i d = unpack_32_1x128 (dst);
  1644     return pack_1x128_32 (
  1645 	over_1x128 (d, expand_alpha_1x128 (d),
  1646 		    pix_multiply_1x128 (unpack_32_1x128 (src),
  1647 					unpack_32_1x128 (mask))));
  1650 static void
  1651 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
  1652                               pixman_op_t              op,
  1653                               uint32_t *               pd,
  1654                               const uint32_t *         ps,
  1655                               const uint32_t *         pm,
  1656                               int                      w)
  1658     uint32_t s, m, d;
  1660     __m128i xmm_alpha_lo, xmm_alpha_hi;
  1661     __m128i xmm_src_lo, xmm_src_hi;
  1662     __m128i xmm_dst_lo, xmm_dst_hi;
  1663     __m128i xmm_mask_lo, xmm_mask_hi;
  1665     while (w && (uintptr_t)pd & 15)
  1667 	s = *ps++;
  1668 	m = *pm++;
  1669 	d = *pd;
  1671 	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
  1672 	w--;
  1675     while (w >= 4)
  1677 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1678 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1679 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1681 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1682 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1683 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1685 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1686 			    &xmm_alpha_lo, &xmm_alpha_hi);
  1687 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1688 			    &xmm_mask_lo, &xmm_mask_hi,
  1689 			    &xmm_mask_lo, &xmm_mask_hi);
  1691 	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
  1692 		    &xmm_alpha_lo, &xmm_alpha_hi,
  1693 		    &xmm_mask_lo, &xmm_mask_hi);
  1695 	save_128_aligned (
  1696 	    (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
  1698 	ps += 4;
  1699 	pd += 4;
  1700 	pm += 4;
  1701 	w -= 4;
  1704     while (w)
  1706 	s = *ps++;
  1707 	m = *pm++;
  1708 	d = *pd;
  1710 	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
  1711 	w--;
  1715 static void
  1716 sse2_combine_in_ca (pixman_implementation_t *imp,
  1717                     pixman_op_t              op,
  1718                     uint32_t *               pd,
  1719                     const uint32_t *         ps,
  1720                     const uint32_t *         pm,
  1721                     int                      w)
  1723     uint32_t s, m, d;
  1725     __m128i xmm_alpha_lo, xmm_alpha_hi;
  1726     __m128i xmm_src_lo, xmm_src_hi;
  1727     __m128i xmm_dst_lo, xmm_dst_hi;
  1728     __m128i xmm_mask_lo, xmm_mask_hi;
  1730     while (w && (uintptr_t)pd & 15)
  1732 	s = *ps++;
  1733 	m = *pm++;
  1734 	d = *pd;
  1736 	*pd++ = pack_1x128_32 (
  1737 	    pix_multiply_1x128 (
  1738 		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
  1739 		expand_alpha_1x128 (unpack_32_1x128 (d))));
  1741 	w--;
  1744     while (w >= 4)
  1746 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1747 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1748 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1750 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1751 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1752 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1754 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1755 			    &xmm_alpha_lo, &xmm_alpha_hi);
  1757 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1758 			    &xmm_mask_lo, &xmm_mask_hi,
  1759 			    &xmm_dst_lo, &xmm_dst_hi);
  1761 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
  1762 			    &xmm_alpha_lo, &xmm_alpha_hi,
  1763 			    &xmm_dst_lo, &xmm_dst_hi);
  1765 	save_128_aligned (
  1766 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1768 	ps += 4;
  1769 	pd += 4;
  1770 	pm += 4;
  1771 	w -= 4;
  1774     while (w)
  1776 	s = *ps++;
  1777 	m = *pm++;
  1778 	d = *pd;
  1780 	*pd++ = pack_1x128_32 (
  1781 	    pix_multiply_1x128 (
  1782 		pix_multiply_1x128 (
  1783 		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
  1784 		expand_alpha_1x128 (unpack_32_1x128 (d))));
  1786 	w--;
  1790 static void
  1791 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
  1792                             pixman_op_t              op,
  1793                             uint32_t *               pd,
  1794                             const uint32_t *         ps,
  1795                             const uint32_t *         pm,
  1796                             int                      w)
  1798     uint32_t s, m, d;
  1800     __m128i xmm_alpha_lo, xmm_alpha_hi;
  1801     __m128i xmm_src_lo, xmm_src_hi;
  1802     __m128i xmm_dst_lo, xmm_dst_hi;
  1803     __m128i xmm_mask_lo, xmm_mask_hi;
  1805     while (w && (uintptr_t)pd & 15)
  1807 	s = *ps++;
  1808 	m = *pm++;
  1809 	d = *pd;
  1811 	*pd++ = pack_1x128_32 (
  1812 	    pix_multiply_1x128 (
  1813 		unpack_32_1x128 (d),
  1814 		pix_multiply_1x128 (unpack_32_1x128 (m),
  1815 				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
  1816 	w--;
  1819     while (w >= 4)
  1821 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1822 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1823 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1825 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1826 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1827 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1829 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1830 			    &xmm_alpha_lo, &xmm_alpha_hi);
  1831 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
  1832 			    &xmm_alpha_lo, &xmm_alpha_hi,
  1833 			    &xmm_alpha_lo, &xmm_alpha_hi);
  1835 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
  1836 			    &xmm_alpha_lo, &xmm_alpha_hi,
  1837 			    &xmm_dst_lo, &xmm_dst_hi);
  1839 	save_128_aligned (
  1840 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1842 	ps += 4;
  1843 	pd += 4;
  1844 	pm += 4;
  1845 	w -= 4;
  1848     while (w)
  1850 	s = *ps++;
  1851 	m = *pm++;
  1852 	d = *pd;
  1854 	*pd++ = pack_1x128_32 (
  1855 	    pix_multiply_1x128 (
  1856 		unpack_32_1x128 (d),
  1857 		pix_multiply_1x128 (unpack_32_1x128 (m),
  1858 				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
  1859 	w--;
  1863 static void
  1864 sse2_combine_out_ca (pixman_implementation_t *imp,
  1865                      pixman_op_t              op,
  1866                      uint32_t *               pd,
  1867                      const uint32_t *         ps,
  1868                      const uint32_t *         pm,
  1869                      int                      w)
  1871     uint32_t s, m, d;
  1873     __m128i xmm_alpha_lo, xmm_alpha_hi;
  1874     __m128i xmm_src_lo, xmm_src_hi;
  1875     __m128i xmm_dst_lo, xmm_dst_hi;
  1876     __m128i xmm_mask_lo, xmm_mask_hi;
  1878     while (w && (uintptr_t)pd & 15)
  1880 	s = *ps++;
  1881 	m = *pm++;
  1882 	d = *pd;
  1884 	*pd++ = pack_1x128_32 (
  1885 	    pix_multiply_1x128 (
  1886 		pix_multiply_1x128 (
  1887 		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
  1888 		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
  1889 	w--;
  1892     while (w >= 4)
  1894 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1895 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1896 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1898 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1899 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1900 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1902 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1903 			    &xmm_alpha_lo, &xmm_alpha_hi);
  1904 	negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
  1905 		      &xmm_alpha_lo, &xmm_alpha_hi);
  1907 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1908 			    &xmm_mask_lo, &xmm_mask_hi,
  1909 			    &xmm_dst_lo, &xmm_dst_hi);
  1910 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
  1911 			    &xmm_alpha_lo, &xmm_alpha_hi,
  1912 			    &xmm_dst_lo, &xmm_dst_hi);
  1914 	save_128_aligned (
  1915 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1917 	ps += 4;
  1918 	pd += 4;
  1919 	pm += 4;
  1920 	w -= 4;
  1923     while (w)
  1925 	s = *ps++;
  1926 	m = *pm++;
  1927 	d = *pd;
  1929 	*pd++ = pack_1x128_32 (
  1930 	    pix_multiply_1x128 (
  1931 		pix_multiply_1x128 (
  1932 		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
  1933 		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
  1935 	w--;
  1939 static void
  1940 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
  1941                              pixman_op_t              op,
  1942                              uint32_t *               pd,
  1943                              const uint32_t *         ps,
  1944                              const uint32_t *         pm,
  1945                              int                      w)
  1947     uint32_t s, m, d;
  1949     __m128i xmm_alpha_lo, xmm_alpha_hi;
  1950     __m128i xmm_src_lo, xmm_src_hi;
  1951     __m128i xmm_dst_lo, xmm_dst_hi;
  1952     __m128i xmm_mask_lo, xmm_mask_hi;
  1954     while (w && (uintptr_t)pd & 15)
  1956 	s = *ps++;
  1957 	m = *pm++;
  1958 	d = *pd;
  1960 	*pd++ = pack_1x128_32 (
  1961 	    pix_multiply_1x128 (
  1962 		unpack_32_1x128 (d),
  1963 		negate_1x128 (pix_multiply_1x128 (
  1964 				 unpack_32_1x128 (m),
  1965 				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
  1966 	w--;
  1969     while (w >= 4)
  1971 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1972 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1973 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1975 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1976 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1977 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1979 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1980 			    &xmm_alpha_lo, &xmm_alpha_hi);
  1982 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
  1983 			    &xmm_alpha_lo, &xmm_alpha_hi,
  1984 			    &xmm_mask_lo, &xmm_mask_hi);
  1986 	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
  1987 		      &xmm_mask_lo, &xmm_mask_hi);
  1989 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
  1990 			    &xmm_mask_lo, &xmm_mask_hi,
  1991 			    &xmm_dst_lo, &xmm_dst_hi);
  1993 	save_128_aligned (
  1994 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1996 	ps += 4;
  1997 	pd += 4;
  1998 	pm += 4;
  1999 	w -= 4;
  2002     while (w)
  2004 	s = *ps++;
  2005 	m = *pm++;
  2006 	d = *pd;
  2008 	*pd++ = pack_1x128_32 (
  2009 	    pix_multiply_1x128 (
  2010 		unpack_32_1x128 (d),
  2011 		negate_1x128 (pix_multiply_1x128 (
  2012 				 unpack_32_1x128 (m),
  2013 				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
  2014 	w--;
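/* Component-alpha ATOP for a single pixel:
 * dest = (src * mask) * dest.alpha + dest * ~(mask * src.alpha).
 */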
  2018 static force_inline uint32_t
  2019 core_combine_atop_ca_pixel_sse2 (uint32_t src,
  2020                                  uint32_t mask,
  2021                                  uint32_t dst)
  2023     __m128i m = unpack_32_1x128 (mask);
  2024     __m128i s = unpack_32_1x128 (src);
  2025     __m128i d = unpack_32_1x128 (dst);
  2026     __m128i sa = expand_alpha_1x128 (s);
  2027     __m128i da = expand_alpha_1x128 (d);
  2029     s = pix_multiply_1x128 (s, m);
  2030     m = negate_1x128 (pix_multiply_1x128 (m, sa));
  2032     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
  2035 static void
  2036 sse2_combine_atop_ca (pixman_implementation_t *imp,
  2037                       pixman_op_t              op,
  2038                       uint32_t *               pd,
  2039                       const uint32_t *         ps,
  2040                       const uint32_t *         pm,
  2041                       int                      w)
  2043     uint32_t s, m, d;
  2045     __m128i xmm_src_lo, xmm_src_hi;
  2046     __m128i xmm_dst_lo, xmm_dst_hi;
  2047     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
  2048     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
  2049     __m128i xmm_mask_lo, xmm_mask_hi;
  2051     while (w && (uintptr_t)pd & 15)
  2053 	s = *ps++;
  2054 	m = *pm++;
  2055 	d = *pd;
  2057 	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
  2058 	w--;
  2061     while (w >= 4)
  2063 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  2064 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  2065 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  2067 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  2068 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  2069 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  2071 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  2072 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  2073 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  2074 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  2076 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  2077 			    &xmm_mask_lo, &xmm_mask_hi,
  2078 			    &xmm_src_lo, &xmm_src_hi);
  2079 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
  2080 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
  2081 			    &xmm_mask_lo, &xmm_mask_hi);
  2083 	negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  2085 	pix_add_multiply_2x128 (
  2086 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
  2087 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
  2088 	    &xmm_dst_lo, &xmm_dst_hi);
  2090 	save_128_aligned (
  2091 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  2093 	ps += 4;
  2094 	pd += 4;
  2095 	pm += 4;
  2096 	w -= 4;
  2099     while (w)
  2101 	s = *ps++;
  2102 	m = *pm++;
  2103 	d = *pd;
  2105 	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
  2106 	w--;
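/* Component-alpha ATOP_REVERSE for a single pixel:
 * dest = (src * mask) * ~dest.alpha + dest * (mask * src.alpha).
 */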
  2110 static force_inline uint32_t
  2111 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
  2112                                          uint32_t mask,
  2113                                          uint32_t dst)
  2115     __m128i m = unpack_32_1x128 (mask);
  2116     __m128i s = unpack_32_1x128 (src);
  2117     __m128i d = unpack_32_1x128 (dst);
  2119     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
  2120     __m128i sa = expand_alpha_1x128 (s);
  2122     s = pix_multiply_1x128 (s, m);
  2123     m = pix_multiply_1x128 (m, sa);
  2125     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
  2128 static void
  2129 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
  2130                               pixman_op_t              op,
  2131                               uint32_t *               pd,
  2132                               const uint32_t *         ps,
  2133                               const uint32_t *         pm,
  2134                               int                      w)
  2136     uint32_t s, m, d;
  2138     __m128i xmm_src_lo, xmm_src_hi;
  2139     __m128i xmm_dst_lo, xmm_dst_hi;
  2140     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
  2141     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
  2142     __m128i xmm_mask_lo, xmm_mask_hi;
  2144     while (w && (uintptr_t)pd & 15)
  2146 	s = *ps++;
  2147 	m = *pm++;
  2148 	d = *pd;
  2150 	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
  2151 	w--;
  2154     while (w >= 4)
  2156 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  2157 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  2158 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  2160 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  2161 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  2162 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  2164 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  2165 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  2166 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  2167 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  2169 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  2170 			    &xmm_mask_lo, &xmm_mask_hi,
  2171 			    &xmm_src_lo, &xmm_src_hi);
  2172 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
  2173 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
  2174 			    &xmm_mask_lo, &xmm_mask_hi);
  2176 	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
  2177 		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  2179 	pix_add_multiply_2x128 (
  2180 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
  2181 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
  2182 	    &xmm_dst_lo, &xmm_dst_hi);
  2184 	save_128_aligned (
  2185 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  2187 	ps += 4;
  2188 	pd += 4;
  2189 	pm += 4;
  2190 	w -= 4;
  2193     while (w)
  2195 	s = *ps++;
  2196 	m = *pm++;
  2197 	d = *pd;
  2199 	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
  2200 	w--;
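/* Component-alpha XOR for a single pixel:
 * dest = (src * mask) * ~dest.alpha + dest * ~(mask * src.alpha).
 */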
  2204 static force_inline uint32_t
  2205 core_combine_xor_ca_pixel_sse2 (uint32_t src,
  2206                                 uint32_t mask,
  2207                                 uint32_t dst)
  2209     __m128i a = unpack_32_1x128 (mask);
  2210     __m128i s = unpack_32_1x128 (src);
  2211     __m128i d = unpack_32_1x128 (dst);
  2213     __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
  2214 				       a, expand_alpha_1x128 (s)));
  2215     __m128i dest      = pix_multiply_1x128 (s, a);
  2216     __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
  2218     return pack_1x128_32 (pix_add_multiply_1x128 (&d,
  2219                                                 &alpha_dst,
  2220                                                 &dest,
  2221                                                 &alpha_src));
  2224 static void
  2225 sse2_combine_xor_ca (pixman_implementation_t *imp,
  2226                      pixman_op_t              op,
  2227                      uint32_t *               pd,
  2228                      const uint32_t *         ps,
  2229                      const uint32_t *         pm,
  2230                      int                      w)
  2232     uint32_t s, m, d;
  2234     __m128i xmm_src_lo, xmm_src_hi;
  2235     __m128i xmm_dst_lo, xmm_dst_hi;
  2236     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
  2237     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
  2238     __m128i xmm_mask_lo, xmm_mask_hi;
  2240     while (w && (uintptr_t)pd & 15)
  2242 	s = *ps++;
  2243 	m = *pm++;
  2244 	d = *pd;
  2246 	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
  2247 	w--;
  2250     while (w >= 4)
  2252 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  2253 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  2254 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  2256 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  2257 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  2258 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  2260 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  2261 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  2262 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  2263 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  2265 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  2266 			    &xmm_mask_lo, &xmm_mask_hi,
  2267 			    &xmm_src_lo, &xmm_src_hi);
  2268 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
  2269 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
  2270 			    &xmm_mask_lo, &xmm_mask_hi);
  2272 	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
  2273 		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  2274 	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
  2275 		      &xmm_mask_lo, &xmm_mask_hi);
  2277 	pix_add_multiply_2x128 (
  2278 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
  2279 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
  2280 	    &xmm_dst_lo, &xmm_dst_hi);
  2282 	save_128_aligned (
  2283 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  2285 	ps += 4;
  2286 	pd += 4;
  2287 	pm += 4;
  2288 	w -= 4;
  2291     while (w)
  2293 	s = *ps++;
  2294 	m = *pm++;
  2295 	d = *pd;
  2297 	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
  2298 	w--;
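/* Component-alpha ADD: dest = clamp (src * mask + dest), using
 * saturating unsigned byte addition for the sum.
 */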
  2302 static void
  2303 sse2_combine_add_ca (pixman_implementation_t *imp,
  2304                      pixman_op_t              op,
  2305                      uint32_t *               pd,
  2306                      const uint32_t *         ps,
  2307                      const uint32_t *         pm,
  2308                      int                      w)
  2310     uint32_t s, m, d;
  2312     __m128i xmm_src_lo, xmm_src_hi;
  2313     __m128i xmm_dst_lo, xmm_dst_hi;
  2314     __m128i xmm_mask_lo, xmm_mask_hi;
  2316     while (w && (uintptr_t)pd & 15)
  2318 	s = *ps++;
  2319 	m = *pm++;
  2320 	d = *pd;
  2322 	*pd++ = pack_1x128_32 (
  2323 	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
  2324 					       unpack_32_1x128 (m)),
  2325 			   unpack_32_1x128 (d)));
  2326 	w--;
  2329     while (w >= 4)
  2331 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  2332 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  2333 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  2335 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  2336 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  2337 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  2339 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  2340 			    &xmm_mask_lo, &xmm_mask_hi,
  2341 			    &xmm_src_lo, &xmm_src_hi);
  2343 	save_128_aligned (
  2344 	    (__m128i*)pd, pack_2x128_128 (
  2345 		_mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
  2346 		_mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
  2348 	ps += 4;
  2349 	pd += 4;
  2350 	pm += 4;
  2351 	w -= 4;
  2354     while (w)
  2356 	s = *ps++;
  2357 	m = *pm++;
  2358 	d = *pd;
  2360 	*pd++ = pack_1x128_32 (
  2361 	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
  2362 					       unpack_32_1x128 (m)),
  2363 			   unpack_32_1x128 (d)));
  2364 	w--;
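/* Broadcast a 16-bit value across all eight lanes of an XMM register. */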
  2368 static force_inline __m128i
  2369 create_mask_16_128 (uint16_t mask)
  2371     return _mm_set1_epi16 (mask);
  2374 /* Work around a code generation bug in Sun Studio 12. */
  2375 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
  2376 # define create_mask_2x32_128(mask0, mask1)				\
  2377     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
  2378 #else
  2379 static force_inline __m128i
  2380 create_mask_2x32_128 (uint32_t mask0,
  2381                       uint32_t mask1)
  2383     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
  2385 #endif
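/* OVER a solid source onto an a8r8g8b8 destination, no mask. */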
  2387 static void
  2388 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
  2389                             pixman_composite_info_t *info)
  2391     PIXMAN_COMPOSITE_ARGS (info);
  2392     uint32_t src;
  2393     uint32_t    *dst_line, *dst, d;
  2394     int32_t w;
  2395     int dst_stride;
  2396     __m128i xmm_src, xmm_alpha;
  2397     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  2399     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  2401     if (src == 0)
  2402 	return;
  2404     PIXMAN_IMAGE_GET_LINE (
  2405 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  2407     xmm_src = expand_pixel_32_1x128 (src);
  2408     xmm_alpha = expand_alpha_1x128 (xmm_src);
  2410     while (height--)
  2412 	dst = dst_line;
  2414 	dst_line += dst_stride;
  2415 	w = width;
  2417 	while (w && (uintptr_t)dst & 15)
  2419 	    d = *dst;
  2420 	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
  2421 						xmm_alpha,
  2422 						unpack_32_1x128 (d)));
  2423 	    w--;
  2426 	while (w >= 4)
  2428 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  2430 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  2432 	    over_2x128 (&xmm_src, &xmm_src,
  2433 			&xmm_alpha, &xmm_alpha,
  2434 			&xmm_dst_lo, &xmm_dst_hi);
  2436 	    /* rebuild the 4 pixel data and save */
  2437 	    save_128_aligned (
  2438 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  2440 	    w -= 4;
  2441 	    dst += 4;
  2444 	while (w)
  2446 	    d = *dst;
  2447 	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
  2448 						xmm_alpha,
  2449 						unpack_32_1x128 (d)));
  2450 	    w--;
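/* OVER a solid source onto an r5g6b5 destination, no mask;
 * eight 16-bit pixels per SSE2 iteration.
 */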
  2456 static void
  2457 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
  2458                             pixman_composite_info_t *info)
  2460     PIXMAN_COMPOSITE_ARGS (info);
  2461     uint32_t src;
  2462     uint16_t    *dst_line, *dst, d;
  2463     int32_t w;
  2464     int dst_stride;
  2465     __m128i xmm_src, xmm_alpha;
  2466     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
  2468     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  2470     if (src == 0)
  2471 	return;
  2473     PIXMAN_IMAGE_GET_LINE (
  2474 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  2476     xmm_src = expand_pixel_32_1x128 (src);
  2477     xmm_alpha = expand_alpha_1x128 (xmm_src);
  2479     while (height--)
  2481 	dst = dst_line;
  2483 	dst_line += dst_stride;
  2484 	w = width;
  2486 	while (w && (uintptr_t)dst & 15)
  2488 	    d = *dst;
  2490 	    *dst++ = pack_565_32_16 (
  2491 		pack_1x128_32 (over_1x128 (xmm_src,
  2492 					   xmm_alpha,
  2493 					   expand565_16_1x128 (d))));
  2494 	    w--;
  2497 	while (w >= 8)
  2499 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  2501 	    unpack_565_128_4x128 (xmm_dst,
  2502 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
  2504 	    over_2x128 (&xmm_src, &xmm_src,
  2505 			&xmm_alpha, &xmm_alpha,
  2506 			&xmm_dst0, &xmm_dst1);
  2507 	    over_2x128 (&xmm_src, &xmm_src,
  2508 			&xmm_alpha, &xmm_alpha,
  2509 			&xmm_dst2, &xmm_dst3);
  2511 	    xmm_dst = pack_565_4x128_128 (
  2512 		&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
  2514 	    save_128_aligned ((__m128i*)dst, xmm_dst);
  2516 	    dst += 8;
  2517 	    w -= 8;
  2520 	while (w--)
  2522 	    d = *dst;
  2523 	    *dst++ = pack_565_32_16 (
  2524 		pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
  2525 					   expand565_16_1x128 (d))));
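/* ADD a solid source through an a8r8g8b8 component-alpha mask onto an
 * a8r8g8b8 destination; groups of four pixels whose mask is entirely
 * zero are skipped.
 */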
  2531 static void
  2532 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
  2533 				   pixman_composite_info_t *info)
  2535     PIXMAN_COMPOSITE_ARGS (info);
  2536     uint32_t src;
  2537     uint32_t    *dst_line, d;
  2538     uint32_t    *mask_line, m;
  2539     uint32_t pack_cmp;
  2540     int dst_stride, mask_stride;
  2542     __m128i xmm_src;
  2543     __m128i xmm_dst;
  2544     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  2546     __m128i mmx_src, mmx_mask, mmx_dest;
  2548     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  2550     if (src == 0)
  2551 	return;
  2553     PIXMAN_IMAGE_GET_LINE (
  2554 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  2555     PIXMAN_IMAGE_GET_LINE (
  2556 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
  2558     xmm_src = _mm_unpacklo_epi8 (
  2559 	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
  2560     mmx_src   = xmm_src;
  2562     while (height--)
  2564 	int w = width;
  2565 	const uint32_t *pm = (uint32_t *)mask_line;
  2566 	uint32_t *pd = (uint32_t *)dst_line;
  2568 	dst_line += dst_stride;
  2569 	mask_line += mask_stride;
  2571 	while (w && (uintptr_t)pd & 15)
  2573 	    m = *pm++;
  2575 	    if (m)
  2577 		d = *pd;
  2579 		mmx_mask = unpack_32_1x128 (m);
  2580 		mmx_dest = unpack_32_1x128 (d);
  2582 		*pd = pack_1x128_32 (
  2583 		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
  2584 				   mmx_dest));
  2587 	    pd++;
  2588 	    w--;
  2591 	while (w >= 4)
  2593 	    xmm_mask = load_128_unaligned ((__m128i*)pm);
  2595 	    pack_cmp =
  2596 		_mm_movemask_epi8 (
  2597 		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
  2599 	    /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
  2600 	    if (pack_cmp != 0xffff)
  2602 		xmm_dst = load_128_aligned ((__m128i*)pd);
  2604 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  2606 		pix_multiply_2x128 (&xmm_src, &xmm_src,
  2607 				    &xmm_mask_lo, &xmm_mask_hi,
  2608 				    &xmm_mask_lo, &xmm_mask_hi);
  2609 		xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
  2611 		save_128_aligned (
  2612 		    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
  2615 	    pd += 4;
  2616 	    pm += 4;
  2617 	    w -= 4;
  2620 	while (w)
  2622 	    m = *pm++;
  2624 	    if (m)
  2626 		d = *pd;
  2628 		mmx_mask = unpack_32_1x128 (m);
  2629 		mmx_dest = unpack_32_1x128 (d);
  2631 		*pd = pack_1x128_32 (
  2632 		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
  2633 				   mmx_dest));
  2636 	    pd++;
  2637 	    w--;
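/* OVER a solid source through an a8r8g8b8 component-alpha mask onto an
 * a8r8g8b8 destination.
 */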
  2643 static void
  2644 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
  2645                                     pixman_composite_info_t *info)
  2647     PIXMAN_COMPOSITE_ARGS (info);
  2648     uint32_t src;
  2649     uint32_t    *dst_line, d;
  2650     uint32_t    *mask_line, m;
  2651     uint32_t pack_cmp;
  2652     int dst_stride, mask_stride;
  2654     __m128i xmm_src, xmm_alpha;
  2655     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  2656     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  2658     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
  2660     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  2662     if (src == 0)
  2663 	return;
  2665     PIXMAN_IMAGE_GET_LINE (
  2666 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  2667     PIXMAN_IMAGE_GET_LINE (
  2668 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
  2670     xmm_src = _mm_unpacklo_epi8 (
  2671 	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
  2672     xmm_alpha = expand_alpha_1x128 (xmm_src);
  2673     mmx_src   = xmm_src;
  2674     mmx_alpha = xmm_alpha;
  2676     while (height--)
  2678 	int w = width;
  2679 	const uint32_t *pm = (uint32_t *)mask_line;
  2680 	uint32_t *pd = (uint32_t *)dst_line;
  2682 	dst_line += dst_stride;
  2683 	mask_line += mask_stride;
  2685 	while (w && (uintptr_t)pd & 15)
  2687 	    m = *pm++;
  2689 	    if (m)
  2691 		d = *pd;
  2692 		mmx_mask = unpack_32_1x128 (m);
  2693 		mmx_dest = unpack_32_1x128 (d);
  2695 		*pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
  2696 		                                  &mmx_alpha,
  2697 		                                  &mmx_mask,
  2698 		                                  &mmx_dest));
  2701 	    pd++;
  2702 	    w--;
  2705 	while (w >= 4)
  2707 	    xmm_mask = load_128_unaligned ((__m128i*)pm);
  2709 	    pack_cmp =
  2710 		_mm_movemask_epi8 (
  2711 		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
  2713 	    /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
  2714 	    if (pack_cmp != 0xffff)
  2716 		xmm_dst = load_128_aligned ((__m128i*)pd);
  2718 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  2719 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  2721 		in_over_2x128 (&xmm_src, &xmm_src,
  2722 			       &xmm_alpha, &xmm_alpha,
  2723 			       &xmm_mask_lo, &xmm_mask_hi,
  2724 			       &xmm_dst_lo, &xmm_dst_hi);
  2726 		save_128_aligned (
  2727 		    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  2730 	    pd += 4;
  2731 	    pm += 4;
  2732 	    w -= 4;
  2735 	while (w)
  2737 	    m = *pm++;
  2739 	    if (m)
  2741 		d = *pd;
  2742 		mmx_mask = unpack_32_1x128 (m);
  2743 		mmx_dest = unpack_32_1x128 (d);
  2745 		*pd = pack_1x128_32 (
  2746 		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
  2749 	    pd++;
  2750 	    w--;
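/* OVER an a8r8g8b8 source, scaled by the alpha of a solid mask, onto an
 * a8r8g8b8 destination; all-zero groups of four source pixels are skipped.
 */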
  2756 static void
  2757 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
  2758                                  pixman_composite_info_t *info)
  2760     PIXMAN_COMPOSITE_ARGS (info);
  2761     uint32_t    *dst_line, *dst;
  2762     uint32_t    *src_line, *src;
  2763     uint32_t mask;
  2764     int32_t w;
  2765     int dst_stride, src_stride;
  2767     __m128i xmm_mask;
  2768     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  2769     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  2770     __m128i xmm_alpha_lo, xmm_alpha_hi;
  2772     PIXMAN_IMAGE_GET_LINE (
  2773 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  2774     PIXMAN_IMAGE_GET_LINE (
  2775 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  2777     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
  2779     xmm_mask = create_mask_16_128 (mask >> 24);
  2781     while (height--)
  2783 	dst = dst_line;
  2784 	dst_line += dst_stride;
  2785 	src = src_line;
  2786 	src_line += src_stride;
  2787 	w = width;
  2789 	while (w && (uintptr_t)dst & 15)
  2791 	    uint32_t s = *src++;
  2793 	    if (s)
  2795 		uint32_t d = *dst;
  2797 		__m128i ms = unpack_32_1x128 (s);
  2798 		__m128i alpha    = expand_alpha_1x128 (ms);
  2799 		__m128i dest     = xmm_mask;
  2800 		__m128i alpha_dst = unpack_32_1x128 (d);
  2802 		*dst = pack_1x128_32 (
  2803 		    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
  2805 	    dst++;
  2806 	    w--;
  2809 	while (w >= 4)
  2811 	    xmm_src = load_128_unaligned ((__m128i*)src);
  2813 	    if (!is_zero (xmm_src))
  2815 		xmm_dst = load_128_aligned ((__m128i*)dst);
  2817 		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  2818 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  2819 		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  2820 				    &xmm_alpha_lo, &xmm_alpha_hi);
  2822 		in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
  2823 			       &xmm_alpha_lo, &xmm_alpha_hi,
  2824 			       &xmm_mask, &xmm_mask,
  2825 			       &xmm_dst_lo, &xmm_dst_hi);
  2827 		save_128_aligned (
  2828 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  2831 	    dst += 4;
  2832 	    src += 4;
  2833 	    w -= 4;
  2836 	while (w)
  2838 	    uint32_t s = *src++;
  2840 	    if (s)
  2842 		uint32_t d = *dst;
  2844 		__m128i ms = unpack_32_1x128 (s);
  2845 		__m128i alpha = expand_alpha_1x128 (ms);
  2846 		__m128i mask  = xmm_mask;
  2847 		__m128i dest  = unpack_32_1x128 (d);
  2849 		*dst = pack_1x128_32 (
  2850 		    in_over_1x128 (&ms, &alpha, &mask, &dest));
  2853 	    dst++;
  2854 	    w--;
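/* SRC: convert x8r8g8b8 pixels to r5g6b5, eight pixels per SSE2 iteration. */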
  2860 static void
  2861 sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
  2862                               pixman_composite_info_t *info)
  2864     PIXMAN_COMPOSITE_ARGS (info);
  2865     uint16_t    *dst_line, *dst;
  2866     uint32_t    *src_line, *src, s;
  2867     int dst_stride, src_stride;
  2868     int32_t w;
  2870     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  2871     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  2873     while (height--)
  2875 	dst = dst_line;
  2876 	dst_line += dst_stride;
  2877 	src = src_line;
  2878 	src_line += src_stride;
  2879 	w = width;
  2881 	while (w && (uintptr_t)dst & 15)
  2883 	    s = *src++;
  2884 	    *dst = convert_8888_to_0565 (s);
  2885 	    dst++;
  2886 	    w--;
  2889 	while (w >= 8)
  2891 	    __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
  2892 	    __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
  2894 	    save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
  2896 	    w -= 8;
  2897 	    src += 8;
  2898 	    dst += 8;
  2901 	while (w)
  2903 	    s = *src++;
  2904 	    *dst = convert_8888_to_0565 (s);
  2905 	    dst++;
  2906 	    w--;
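/* SRC: copy x8r8g8b8 to a8r8g8b8, forcing the alpha byte to 0xff. */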
  2911 static void
  2912 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
  2913 			      pixman_composite_info_t *info)
  2915     PIXMAN_COMPOSITE_ARGS (info);
  2916     uint32_t    *dst_line, *dst;
  2917     uint32_t    *src_line, *src;
  2918     int32_t w;
  2919     int dst_stride, src_stride;
  2922     PIXMAN_IMAGE_GET_LINE (
  2923 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  2924     PIXMAN_IMAGE_GET_LINE (
  2925 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  2927     while (height--)
  2929 	dst = dst_line;
  2930 	dst_line += dst_stride;
  2931 	src = src_line;
  2932 	src_line += src_stride;
  2933 	w = width;
  2935 	while (w && (uintptr_t)dst & 15)
  2937 	    *dst++ = *src++ | 0xff000000;
  2938 	    w--;
  2941 	while (w >= 16)
  2943 	    __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
  2945 	    xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
  2946 	    xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
  2947 	    xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
  2948 	    xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
  2950 	    save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
  2951 	    save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
  2952 	    save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
  2953 	    save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
  2955 	    dst += 16;
  2956 	    src += 16;
  2957 	    w -= 16;
  2960 	while (w)
  2962 	    *dst++ = *src++ | 0xff000000;
  2963 	    w--;
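/* OVER an x8r8g8b8 source (treated as opaque), scaled by the alpha of a
 * solid mask, onto an a8r8g8b8 destination.
 */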
  2969 static void
  2970 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
  2971                                  pixman_composite_info_t *info)
  2973     PIXMAN_COMPOSITE_ARGS (info);
  2974     uint32_t    *dst_line, *dst;
  2975     uint32_t    *src_line, *src;
  2976     uint32_t mask;
  2977     int dst_stride, src_stride;
  2978     int32_t w;
  2980     __m128i xmm_mask, xmm_alpha;
  2981     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  2982     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  2984     PIXMAN_IMAGE_GET_LINE (
  2985 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  2986     PIXMAN_IMAGE_GET_LINE (
  2987 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  2989     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
  2991     xmm_mask = create_mask_16_128 (mask >> 24);
  2992     xmm_alpha = mask_00ff;
  2994     while (height--)
  2996 	dst = dst_line;
  2997 	dst_line += dst_stride;
  2998 	src = src_line;
  2999 	src_line += src_stride;
  3000 	w = width;
  3002 	while (w && (uintptr_t)dst & 15)
  3004 	    uint32_t s = (*src++) | 0xff000000;
  3005 	    uint32_t d = *dst;
  3007 	    __m128i src   = unpack_32_1x128 (s);
  3008 	    __m128i alpha = xmm_alpha;
  3009 	    __m128i mask  = xmm_mask;
  3010 	    __m128i dest  = unpack_32_1x128 (d);
  3012 	    *dst++ = pack_1x128_32 (
  3013 		in_over_1x128 (&src, &alpha, &mask, &dest));
  3015 	    w--;
  3018 	while (w >= 4)
  3020 	    xmm_src = _mm_or_si128 (
  3021 		load_128_unaligned ((__m128i*)src), mask_ff000000);
  3022 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  3024 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  3025 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  3027 	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
  3028 			   &xmm_alpha, &xmm_alpha,
  3029 			   &xmm_mask, &xmm_mask,
  3030 			   &xmm_dst_lo, &xmm_dst_hi);
  3032 	    save_128_aligned (
  3033 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  3035 	    dst += 4;
  3036 	    src += 4;
  3037 	    w -= 4;
  3041 	while (w)
  3043 	    uint32_t s = (*src++) | 0xff000000;
  3044 	    uint32_t d = *dst;
  3046 	    __m128i src  = unpack_32_1x128 (s);
  3047 	    __m128i alpha = xmm_alpha;
  3048 	    __m128i mask  = xmm_mask;
  3049 	    __m128i dest  = unpack_32_1x128 (d);
  3051 	    *dst++ = pack_1x128_32 (
  3052 		in_over_1x128 (&src, &alpha, &mask, &dest));
  3054 	    w--;
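/* OVER an a8r8g8b8 source onto an a8r8g8b8 destination, one scanline at a
 * time via sse2_combine_over_u.
 */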
  3060 static void
  3061 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
  3062                                pixman_composite_info_t *info)
  3064     PIXMAN_COMPOSITE_ARGS (info);
  3065     int dst_stride, src_stride;
  3066     uint32_t    *dst_line, *dst;
  3067     uint32_t    *src_line, *src;
  3069     PIXMAN_IMAGE_GET_LINE (
  3070 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  3071     PIXMAN_IMAGE_GET_LINE (
  3072 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  3074     dst = dst_line;
  3075     src = src_line;
  3077     while (height--)
  3079 	sse2_combine_over_u (imp, op, dst, src, NULL, width);
  3081 	dst += dst_stride;
  3082 	src += src_stride;
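/* OVER a single a8r8g8b8 pixel onto an r5g6b5 pixel. */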
  3086 static force_inline uint16_t
  3087 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
  3089     __m128i ms;
  3091     ms = unpack_32_1x128 (src);
  3092     return pack_565_32_16 (
  3093 	pack_1x128_32 (
  3094 	    over_1x128 (
  3095 		ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
  3098 static void
  3099 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
  3100                                pixman_composite_info_t *info)
  3102     PIXMAN_COMPOSITE_ARGS (info);
  3103     uint16_t    *dst_line, *dst, d;
  3104     uint32_t    *src_line, *src, s;
  3105     int dst_stride, src_stride;
  3106     int32_t w;
  3108     __m128i xmm_alpha_lo, xmm_alpha_hi;
  3109     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  3110     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
  3112     PIXMAN_IMAGE_GET_LINE (
  3113 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  3114     PIXMAN_IMAGE_GET_LINE (
  3115 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  3117     while (height--)
  3119 	dst = dst_line;
  3120 	src = src_line;
  3122 	dst_line += dst_stride;
  3123 	src_line += src_stride;
  3124 	w = width;
  3126 	/* Align dst on a 16-byte boundary */
  3127 	while (w &&
  3128 	       ((uintptr_t)dst & 15))
  3130 	    s = *src++;
  3131 	    d = *dst;
  3133 	    *dst++ = composite_over_8888_0565pixel (s, d);
  3134 	    w--;
  3137 	/* It's an 8 pixel loop */
  3138 	while (w >= 8)
  3140 	    /* I'm loading unaligned because I'm not sure
  3141 	     * about the address alignment.
  3142 	     */
  3143 	    xmm_src = load_128_unaligned ((__m128i*) src);
  3144 	    xmm_dst = load_128_aligned ((__m128i*) dst);
  3146 	    /* Unpacking */
  3147 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  3148 	    unpack_565_128_4x128 (xmm_dst,
  3149 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
  3150 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  3151 				&xmm_alpha_lo, &xmm_alpha_hi);
  3153 	    /* Load the next 4 pixels from memory ahead of time
  3154 	     * to optimize the memory read.
  3155 	     */
  3156 	    xmm_src = load_128_unaligned ((__m128i*) (src + 4));
  3158 	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
  3159 			&xmm_alpha_lo, &xmm_alpha_hi,
  3160 			&xmm_dst0, &xmm_dst1);
  3162 	    /* Unpacking */
  3163 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  3164 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  3165 				&xmm_alpha_lo, &xmm_alpha_hi);
  3167 	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
  3168 			&xmm_alpha_lo, &xmm_alpha_hi,
  3169 			&xmm_dst2, &xmm_dst3);
  3171 	    save_128_aligned (
  3172 		(__m128i*)dst, pack_565_4x128_128 (
  3173 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
  3175 	    w -= 8;
  3176 	    dst += 8;
  3177 	    src += 8;
  3180 	while (w--)
  3182 	    s = *src++;
  3183 	    d = *dst;
  3185 	    *dst++ = composite_over_8888_0565pixel (s, d);
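/* OVER a solid source through an a8 mask onto an a8r8g8b8 destination;
 * four fully-opaque mask bytes with an opaque source are stored directly,
 * all-zero mask groups are skipped.
 */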
  3191 static void
  3192 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
  3193                               pixman_composite_info_t *info)
  3195     PIXMAN_COMPOSITE_ARGS (info);
  3196     uint32_t src, srca;
  3197     uint32_t *dst_line, *dst;
  3198     uint8_t *mask_line, *mask;
  3199     int dst_stride, mask_stride;
  3200     int32_t w;
  3201     uint32_t m, d;
  3203     __m128i xmm_src, xmm_alpha, xmm_def;
  3204     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  3205     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  3207     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
  3209     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  3211     srca = src >> 24;
  3212     if (src == 0)
  3213 	return;
  3215     PIXMAN_IMAGE_GET_LINE (
  3216 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  3217     PIXMAN_IMAGE_GET_LINE (
  3218 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  3220     xmm_def = create_mask_2x32_128 (src, src);
  3221     xmm_src = expand_pixel_32_1x128 (src);
  3222     xmm_alpha = expand_alpha_1x128 (xmm_src);
  3223     mmx_src   = xmm_src;
  3224     mmx_alpha = xmm_alpha;
  3226     while (height--)
  3228 	dst = dst_line;
  3229 	dst_line += dst_stride;
  3230 	mask = mask_line;
  3231 	mask_line += mask_stride;
  3232 	w = width;
  3234 	while (w && (uintptr_t)dst & 15)
  3236 	    uint8_t m = *mask++;
  3238 	    if (m)
  3240 		d = *dst;
  3241 		mmx_mask = expand_pixel_8_1x128 (m);
  3242 		mmx_dest = unpack_32_1x128 (d);
  3244 		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
  3245 		                                   &mmx_alpha,
  3246 		                                   &mmx_mask,
  3247 		                                   &mmx_dest));
  3250 	    w--;
  3251 	    dst++;
  3254 	while (w >= 4)
  3256 	    m = *((uint32_t*)mask);
  3258 	    if (srca == 0xff && m == 0xffffffff)
  3260 		save_128_aligned ((__m128i*)dst, xmm_def);
  3262 	    else if (m)
  3264 		xmm_dst = load_128_aligned ((__m128i*) dst);
  3265 		xmm_mask = unpack_32_1x128 (m);
  3266 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
  3268 		/* Unpacking */
  3269 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  3270 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  3272 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
  3273 					&xmm_mask_lo, &xmm_mask_hi);
  3275 		in_over_2x128 (&xmm_src, &xmm_src,
  3276 			       &xmm_alpha, &xmm_alpha,
  3277 			       &xmm_mask_lo, &xmm_mask_hi,
  3278 			       &xmm_dst_lo, &xmm_dst_hi);
  3280 		save_128_aligned (
  3281 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  3284 	    w -= 4;
  3285 	    dst += 4;
  3286 	    mask += 4;
  3289 	while (w)
  3291 	    uint8_t m = *mask++;
  3293 	    if (m)
  3295 		d = *dst;
  3296 		mmx_mask = expand_pixel_8_1x128 (m);
  3297 		mmx_dest = unpack_32_1x128 (d);
  3299 		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
  3300 		                                   &mmx_alpha,
  3301 		                                   &mmx_mask,
  3302 		                                   &mmx_dest));
  3305 	    w--;
  3306 	    dst++;
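/* Fill a rectangle of an 8, 16 or 32 bpp buffer with a constant value,
 * replicating the filler to 32 bits and storing up to 128 bytes per
 * iteration once the destination is 16-byte aligned.
 */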
  3312 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
  3313 __attribute__((__force_align_arg_pointer__))
  3314 #endif
  3315 static pixman_bool_t
  3316 sse2_fill (pixman_implementation_t *imp,
  3317            uint32_t *               bits,
  3318            int                      stride,
  3319            int                      bpp,
  3320            int                      x,
  3321            int                      y,
  3322            int                      width,
  3323            int                      height,
  3324            uint32_t		    filler)
  3326     uint32_t byte_width;
  3327     uint8_t *byte_line;
  3329     __m128i xmm_def;
  3331     if (bpp == 8)
  3333 	uint8_t b;
  3334 	uint16_t w;
  3336 	stride = stride * (int) sizeof (uint32_t) / 1;
  3337 	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
  3338 	byte_width = width;
  3339 	stride *= 1;
  3341 	b = filler & 0xff;
  3342 	w = (b << 8) | b;
  3343 	filler = (w << 16) | w;
  3345     else if (bpp == 16)
  3347 	stride = stride * (int) sizeof (uint32_t) / 2;
  3348 	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
  3349 	byte_width = 2 * width;
  3350 	stride *= 2;
  3352         filler = (filler & 0xffff) * 0x00010001;
  3354     else if (bpp == 32)
  3356 	stride = stride * (int) sizeof (uint32_t) / 4;
  3357 	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
  3358 	byte_width = 4 * width;
  3359 	stride *= 4;
  3361     else
  3363 	return FALSE;
  3366     xmm_def = create_mask_2x32_128 (filler, filler);
  3368     while (height--)
  3370 	int w;
  3371 	uint8_t *d = byte_line;
  3372 	byte_line += stride;
  3373 	w = byte_width;
  3375 	if (w >= 1 && ((uintptr_t)d & 1))
  3377 	    *(uint8_t *)d = filler;
  3378 	    w -= 1;
  3379 	    d += 1;
  3382 	while (w >= 2 && ((uintptr_t)d & 3))
  3384 	    *(uint16_t *)d = filler;
  3385 	    w -= 2;
  3386 	    d += 2;
  3389 	while (w >= 4 && ((uintptr_t)d & 15))
  3391 	    *(uint32_t *)d = filler;
  3393 	    w -= 4;
  3394 	    d += 4;
  3397 	while (w >= 128)
  3399 	    save_128_aligned ((__m128i*)(d),     xmm_def);
  3400 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
  3401 	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
  3402 	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
  3403 	    save_128_aligned ((__m128i*)(d + 64),  xmm_def);
  3404 	    save_128_aligned ((__m128i*)(d + 80),  xmm_def);
  3405 	    save_128_aligned ((__m128i*)(d + 96),  xmm_def);
  3406 	    save_128_aligned ((__m128i*)(d + 112), xmm_def);
  3408 	    d += 128;
  3409 	    w -= 128;
  3412 	if (w >= 64)
  3414 	    save_128_aligned ((__m128i*)(d),     xmm_def);
  3415 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
  3416 	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
  3417 	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
  3419 	    d += 64;
  3420 	    w -= 64;
  3423 	if (w >= 32)
  3425 	    save_128_aligned ((__m128i*)(d),     xmm_def);
  3426 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
  3428 	    d += 32;
  3429 	    w -= 32;
  3432 	if (w >= 16)
  3434 	    save_128_aligned ((__m128i*)(d),     xmm_def);
  3436 	    d += 16;
  3437 	    w -= 16;
  3440 	while (w >= 4)
  3442 	    *(uint32_t *)d = filler;
  3444 	    w -= 4;
  3445 	    d += 4;
  3448 	if (w >= 2)
  3450 	    *(uint16_t *)d = filler;
  3451 	    w -= 2;
  3452 	    d += 2;
  3455 	if (w >= 1)
  3457 	    *(uint8_t *)d = filler;
  3458 	    w -= 1;
  3459 	    d += 1;
  3463     return TRUE;
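/* SRC: a solid color multiplied by an a8 mask, written to an a8r8g8b8
 * destination (zero where the mask is zero).
 */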
  3466 static void
  3467 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
  3468                              pixman_composite_info_t *info)
  3470     PIXMAN_COMPOSITE_ARGS (info);
  3471     uint32_t src, srca;
  3472     uint32_t    *dst_line, *dst;
  3473     uint8_t     *mask_line, *mask;
  3474     int dst_stride, mask_stride;
  3475     int32_t w;
  3476     uint32_t m;
  3478     __m128i xmm_src, xmm_def;
  3479     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  3481     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  3483     srca = src >> 24;
  3484     if (src == 0)
  3486 	sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
  3487 		   PIXMAN_FORMAT_BPP (dest_image->bits.format),
  3488 		   dest_x, dest_y, width, height, 0);
  3489 	return;
  3492     PIXMAN_IMAGE_GET_LINE (
  3493 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  3494     PIXMAN_IMAGE_GET_LINE (
  3495 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  3497     xmm_def = create_mask_2x32_128 (src, src);
  3498     xmm_src = expand_pixel_32_1x128 (src);
  3500     while (height--)
  3502 	dst = dst_line;
  3503 	dst_line += dst_stride;
  3504 	mask = mask_line;
  3505 	mask_line += mask_stride;
  3506 	w = width;
  3508 	while (w && (uintptr_t)dst & 15)
  3510 	    uint8_t m = *mask++;
  3512 	    if (m)
  3514 		*dst = pack_1x128_32 (
  3515 		    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
  3517 	    else
  3519 		*dst = 0;
  3522 	    w--;
  3523 	    dst++;
  3526 	while (w >= 4)
  3528 	    m = *((uint32_t*)mask);
  3530 	    if (srca == 0xff && m == 0xffffffff)
  3532 		save_128_aligned ((__m128i*)dst, xmm_def);
  3534 	    else if (m)
  3536 		xmm_mask = unpack_32_1x128 (m);
  3537 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
  3539 		/* Unpacking */
  3540 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  3542 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
  3543 					&xmm_mask_lo, &xmm_mask_hi);
  3545 		pix_multiply_2x128 (&xmm_src, &xmm_src,
  3546 				    &xmm_mask_lo, &xmm_mask_hi,
  3547 				    &xmm_mask_lo, &xmm_mask_hi);
  3549 		save_128_aligned (
  3550 		    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
  3552 	    else
  3554 		save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
  3557 	    w -= 4;
  3558 	    dst += 4;
  3559 	    mask += 4;
  3562 	while (w)
  3564 	    uint8_t m = *mask++;
  3566 	    if (m)
  3568 		*dst = pack_1x128_32 (
  3569 		    pix_multiply_1x128 (
  3570 			xmm_src, expand_pixel_8_1x128 (m)));
  3572 	    else
  3574 		*dst = 0;
  3577 	    w--;
  3578 	    dst++;
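/* OVER a solid source through an a8 mask onto an r5g6b5 destination. */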
  3584 static void
  3585 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
  3586                               pixman_composite_info_t *info)
  3588     PIXMAN_COMPOSITE_ARGS (info);
  3589     uint32_t src;
  3590     uint16_t    *dst_line, *dst, d;
  3591     uint8_t     *mask_line, *mask;
  3592     int dst_stride, mask_stride;
  3593     int32_t w;
  3594     uint32_t m;
  3595     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
  3597     __m128i xmm_src, xmm_alpha;
  3598     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  3599     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
  3601     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  3603     if (src == 0)
  3604 	return;
  3606     PIXMAN_IMAGE_GET_LINE (
  3607 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  3608     PIXMAN_IMAGE_GET_LINE (
  3609 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  3611     xmm_src = expand_pixel_32_1x128 (src);
  3612     xmm_alpha = expand_alpha_1x128 (xmm_src);
  3613     mmx_src = xmm_src;
  3614     mmx_alpha = xmm_alpha;
  3616     while (height--)
  3618 	dst = dst_line;
  3619 	dst_line += dst_stride;
  3620 	mask = mask_line;
  3621 	mask_line += mask_stride;
  3622 	w = width;
  3624 	while (w && (uintptr_t)dst & 15)
  3626 	    m = *mask++;
  3628 	    if (m)
  3630 		d = *dst;
  3631 		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
  3632 		mmx_dest = expand565_16_1x128 (d);
  3634 		*dst = pack_565_32_16 (
  3635 		    pack_1x128_32 (
  3636 			in_over_1x128 (
  3637 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
  3640 	    w--;
  3641 	    dst++;
  3644 	while (w >= 8)
  3646 	    xmm_dst = load_128_aligned ((__m128i*) dst);
  3647 	    unpack_565_128_4x128 (xmm_dst,
  3648 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
  3650 	    m = *((uint32_t*)mask);
  3651 	    mask += 4;
  3653 	    if (m)
  3655 		xmm_mask = unpack_32_1x128 (m);
  3656 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
  3658 		/* Unpacking */
  3659 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  3661 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
  3662 					&xmm_mask_lo, &xmm_mask_hi);
  3664 		in_over_2x128 (&xmm_src, &xmm_src,
  3665 			       &xmm_alpha, &xmm_alpha,
  3666 			       &xmm_mask_lo, &xmm_mask_hi,
  3667 			       &xmm_dst0, &xmm_dst1);
  3670 	    m = *((uint32_t*)mask);
  3671 	    mask += 4;
  3673 	    if (m)
  3675 		xmm_mask = unpack_32_1x128 (m);
  3676 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
  3678 		/* Unpacking */
  3679 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  3681 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
  3682 					&xmm_mask_lo, &xmm_mask_hi);
  3683 		in_over_2x128 (&xmm_src, &xmm_src,
  3684 			       &xmm_alpha, &xmm_alpha,
  3685 			       &xmm_mask_lo, &xmm_mask_hi,
  3686 			       &xmm_dst2, &xmm_dst3);
  3689 	    save_128_aligned (
  3690 		(__m128i*)dst, pack_565_4x128_128 (
  3691 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
  3693 	    w -= 8;
  3694 	    dst += 8;
  3697 	while (w)
  3699 	    m = *mask++;
  3701 	    if (m)
  3703 		d = *dst;
  3704 		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
  3705 		mmx_dest = expand565_16_1x128 (d);
  3707 		*dst = pack_565_32_16 (
  3708 		    pack_1x128_32 (
  3709 			in_over_1x128 (
  3710 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
  3713 	    w--;
  3714 	    dst++;
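/* OVER a non-premultiplied source with swapped red/blue channels (the
 * pixbuf layout) onto an r5g6b5 destination; fully opaque source groups
 * only get their colors reordered, fully transparent ones are skipped.
 */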
  3720 static void
  3721 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
  3722                                  pixman_composite_info_t *info)
  3724     PIXMAN_COMPOSITE_ARGS (info);
  3725     uint16_t    *dst_line, *dst, d;
  3726     uint32_t    *src_line, *src, s;
  3727     int dst_stride, src_stride;
  3728     int32_t w;
  3729     uint32_t opaque, zero;
  3731     __m128i ms;
  3732     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  3733     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
  3735     PIXMAN_IMAGE_GET_LINE (
  3736 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  3737     PIXMAN_IMAGE_GET_LINE (
  3738 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  3740     while (height--)
  3742 	dst = dst_line;
  3743 	dst_line += dst_stride;
  3744 	src = src_line;
  3745 	src_line += src_stride;
  3746 	w = width;
  3748 	while (w && (uintptr_t)dst & 15)
  3750 	    s = *src++;
  3751 	    d = *dst;
  3753 	    ms = unpack_32_1x128 (s);
  3755 	    *dst++ = pack_565_32_16 (
  3756 		pack_1x128_32 (
  3757 		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
  3758 	    w--;
  3761 	while (w >= 8)
  3763 	    /* First round */
  3764 	    xmm_src = load_128_unaligned ((__m128i*)src);
  3765 	    xmm_dst = load_128_aligned  ((__m128i*)dst);
  3767 	    opaque = is_opaque (xmm_src);
  3768 	    zero = is_zero (xmm_src);
  3770 	    unpack_565_128_4x128 (xmm_dst,
  3771 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
  3772 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  3774 	    /* preload next round */
  3775 	    xmm_src = load_128_unaligned ((__m128i*)(src + 4));
  3777 	    if (opaque)
  3779 		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
  3780 				     &xmm_dst0, &xmm_dst1);
  3782 	    else if (!zero)
  3784 		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
  3785 					&xmm_dst0, &xmm_dst1);
  3788 	    /* Second round */
  3789 	    opaque = is_opaque (xmm_src);
  3790 	    zero = is_zero (xmm_src);
  3792 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  3794 	    if (opaque)
  3796 		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
  3797 				     &xmm_dst2, &xmm_dst3);
  3799 	    else if (!zero)
  3801 		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
  3802 					&xmm_dst2, &xmm_dst3);
  3805 	    save_128_aligned (
  3806 		(__m128i*)dst, pack_565_4x128_128 (
  3807 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
  3809 	    w -= 8;
  3810 	    src += 8;
  3811 	    dst += 8;
  3814 	while (w)
  3816 	    s = *src++;
  3817 	    d = *dst;
  3819 	    ms = unpack_32_1x128 (s);
  3821 	    *dst++ = pack_565_32_16 (
  3822 		pack_1x128_32 (
  3823 		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
  3824 	    w--;
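/* OVER a non-premultiplied source with swapped red/blue channels (the
 * pixbuf layout) onto an a8r8g8b8 destination.
 */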
  3830 static void
  3831 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
  3832                                  pixman_composite_info_t *info)
  3834     PIXMAN_COMPOSITE_ARGS (info);
  3835     uint32_t    *dst_line, *dst, d;
  3836     uint32_t    *src_line, *src, s;
  3837     int dst_stride, src_stride;
  3838     int32_t w;
  3839     uint32_t opaque, zero;
  3841     __m128i xmm_src_lo, xmm_src_hi;
  3842     __m128i xmm_dst_lo, xmm_dst_hi;
  3844     PIXMAN_IMAGE_GET_LINE (
  3845 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  3846     PIXMAN_IMAGE_GET_LINE (
  3847 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  3849     while (height--)
  3851 	dst = dst_line;
  3852 	dst_line += dst_stride;
  3853 	src = src_line;
  3854 	src_line += src_stride;
  3855 	w = width;
  3857 	while (w && (uintptr_t)dst & 15)
  3859 	    s = *src++;
  3860 	    d = *dst;
  3862 	    *dst++ = pack_1x128_32 (
  3863 		over_rev_non_pre_1x128 (
  3864 		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
  3866 	    w--;
  3869 	while (w >= 4)
  3871 	    xmm_src_hi = load_128_unaligned ((__m128i*)src);
  3873 	    opaque = is_opaque (xmm_src_hi);
  3874 	    zero = is_zero (xmm_src_hi);
  3876 	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  3878 	    if (opaque)
  3880 		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
  3881 				     &xmm_dst_lo, &xmm_dst_hi);
  3883 		save_128_aligned (
  3884 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  3886 	    else if (!zero)
  3888 		xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
  3890 		unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  3892 		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
  3893 					&xmm_dst_lo, &xmm_dst_hi);
  3895 		save_128_aligned (
  3896 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  3899 	    w -= 4;
  3900 	    dst += 4;
  3901 	    src += 4;
  3904 	while (w)
  3906 	    s = *src++;
  3907 	    d = *dst;
  3909 	    *dst++ = pack_1x128_32 (
  3910 		over_rev_non_pre_1x128 (
  3911 		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
  3913 	    w--;
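/* OVER a solid source through an a8r8g8b8 component-alpha mask onto an
 * r5g6b5 destination, eight pixels per iteration.
 */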
  3919 static void
  3920 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
  3921                                     pixman_composite_info_t *info)
  3923     PIXMAN_COMPOSITE_ARGS (info);
  3924     uint32_t src;
  3925     uint16_t    *dst_line, *dst, d;
  3926     uint32_t    *mask_line, *mask, m;
  3927     int dst_stride, mask_stride;
  3928     int w;
  3929     uint32_t pack_cmp;
  3931     __m128i xmm_src, xmm_alpha;
  3932     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  3933     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
  3935     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
  3937     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  3939     if (src == 0)
  3940 	return;
  3942     PIXMAN_IMAGE_GET_LINE (
  3943 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  3944     PIXMAN_IMAGE_GET_LINE (
  3945 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
  3947     xmm_src = expand_pixel_32_1x128 (src);
  3948     xmm_alpha = expand_alpha_1x128 (xmm_src);
  3949     mmx_src = xmm_src;
  3950     mmx_alpha = xmm_alpha;
  3952     while (height--)
  3954 	w = width;
  3955 	mask = mask_line;
  3956 	dst = dst_line;
  3957 	mask_line += mask_stride;
  3958 	dst_line += dst_stride;
  3960 	while (w && ((uintptr_t)dst & 15))
  3962 	    m = *(uint32_t *) mask;
  3964 	    if (m)
  3966 		d = *dst;
  3967 		mmx_mask = unpack_32_1x128 (m);
  3968 		mmx_dest = expand565_16_1x128 (d);
  3970 		*dst = pack_565_32_16 (
  3971 		    pack_1x128_32 (
  3972 			in_over_1x128 (
  3973 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
  3976 	    w--;
  3977 	    dst++;
  3978 	    mask++;
  3981 	while (w >= 8)
  3983 	    /* First round */
  3984 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
  3985 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  3987 	    pack_cmp = _mm_movemask_epi8 (
  3988 		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
  3990 	    unpack_565_128_4x128 (xmm_dst,
  3991 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
  3992 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  3994 	    /* preload next round */
  3995 	    xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
  3997 	    /* first round: composite only when the mask is not all zero */
  3998 	    if (pack_cmp != 0xffff)
  4000 		in_over_2x128 (&xmm_src, &xmm_src,
  4001 			       &xmm_alpha, &xmm_alpha,
  4002 			       &xmm_mask_lo, &xmm_mask_hi,
  4003 			       &xmm_dst0, &xmm_dst1);
  4006 	    /* Second round */
  4007 	    pack_cmp = _mm_movemask_epi8 (
  4008 		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
  4010 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  4012 	    if (pack_cmp != 0xffff)
  4014 		in_over_2x128 (&xmm_src, &xmm_src,
  4015 			       &xmm_alpha, &xmm_alpha,
  4016 			       &xmm_mask_lo, &xmm_mask_hi,
  4017 			       &xmm_dst2, &xmm_dst3);
  4020 	    save_128_aligned (
  4021 		(__m128i*)dst, pack_565_4x128_128 (
  4022 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
  4024 	    w -= 8;
  4025 	    dst += 8;
  4026 	    mask += 8;
  4029 	while (w)
  4031 	    m = *(uint32_t *) mask;
  4033 	    if (m)
  4035 		d = *dst;
  4036 		mmx_mask = unpack_32_1x128 (m);
  4037 		mmx_dest = expand565_16_1x128 (d);
  4039 		*dst = pack_565_32_16 (
  4040 		    pack_1x128_32 (
  4041 			in_over_1x128 (
  4042 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
  4045 	    w--;
  4046 	    dst++;
  4047 	    mask++;
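/* IN: dest = solid.alpha * mask * dest on an a8 destination. */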
  4053 static void
  4054 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
  4055                          pixman_composite_info_t *info)
  4057     PIXMAN_COMPOSITE_ARGS (info);
  4058     uint8_t     *dst_line, *dst;
  4059     uint8_t     *mask_line, *mask;
  4060     int dst_stride, mask_stride;
  4061     uint32_t d, m;
  4062     uint32_t src;
  4063     int32_t w;
  4065     __m128i xmm_alpha;
  4066     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  4067     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  4069     PIXMAN_IMAGE_GET_LINE (
  4070 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  4071     PIXMAN_IMAGE_GET_LINE (
  4072 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  4074     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  4076     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
  4078     while (height--)
  4080 	dst = dst_line;
  4081 	dst_line += dst_stride;
  4082 	mask = mask_line;
  4083 	mask_line += mask_stride;
  4084 	w = width;
  4086 	while (w && ((uintptr_t)dst & 15))
  4088 	    m = (uint32_t) *mask++;
  4089 	    d = (uint32_t) *dst;
  4091 	    *dst++ = (uint8_t) pack_1x128_32 (
  4092 		pix_multiply_1x128 (
  4093 		    pix_multiply_1x128 (xmm_alpha,
  4094 				       unpack_32_1x128 (m)),
  4095 		    unpack_32_1x128 (d)));
  4096 	    w--;
  4099 	while (w >= 16)
  4101 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
  4102 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  4104 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  4105 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  4107 	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
  4108 				&xmm_mask_lo, &xmm_mask_hi,
  4109 				&xmm_mask_lo, &xmm_mask_hi);
  4111 	    pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
  4112 				&xmm_dst_lo, &xmm_dst_hi,
  4113 				&xmm_dst_lo, &xmm_dst_hi);
  4115 	    save_128_aligned (
  4116 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  4118 	    mask += 16;
  4119 	    dst += 16;
  4120 	    w -= 16;
  4123 	while (w)
  4125 	    m = (uint32_t) *mask++;
  4126 	    d = (uint32_t) *dst;
  4128 	    *dst++ = (uint8_t) pack_1x128_32 (
  4129 		pix_multiply_1x128 (
  4130 		    pix_multiply_1x128 (
  4131 			xmm_alpha, unpack_32_1x128 (m)),
  4132 		    unpack_32_1x128 (d)));
  4133 	    w--;
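/*
 * IN operator: solid source, no mask, a8 destination (dest = src.alpha * dest).
 * A fully opaque source is a no-op and a zero source degenerates to a zero
 * fill, so both cases are short-circuited before the per-pixel loops.
 */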
  4139 static void
  4140 sse2_composite_in_n_8 (pixman_implementation_t *imp,
  4141 		       pixman_composite_info_t *info)
  4143     PIXMAN_COMPOSITE_ARGS (info);
  4144     uint8_t     *dst_line, *dst;
  4145     int dst_stride;
  4146     uint32_t d;
  4147     uint32_t src;
  4148     int32_t w;
  4150     __m128i xmm_alpha;
  4151     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  4153     PIXMAN_IMAGE_GET_LINE (
  4154 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  4156     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  4158     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
  4160     src = src >> 24;
  4162     if (src == 0xff)
  4163 	return;
  4165     if (src == 0x00)
  4167 	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
  4168 		     8, dest_x, dest_y, width, height, src);
  4170 	return;
  4173     while (height--)
  4175 	dst = dst_line;
  4176 	dst_line += dst_stride;
  4177 	w = width;
  4179 	while (w && ((uintptr_t)dst & 15))
  4181 	    d = (uint32_t) *dst;
  4183 	    *dst++ = (uint8_t) pack_1x128_32 (
  4184 		pix_multiply_1x128 (
  4185 		    xmm_alpha,
  4186 		    unpack_32_1x128 (d)));
  4187 	    w--;
  4190 	while (w >= 16)
  4192 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  4194 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  4196 	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
  4197 				&xmm_dst_lo, &xmm_dst_hi,
  4198 				&xmm_dst_lo, &xmm_dst_hi);
  4200 	    save_128_aligned (
  4201 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  4203 	    dst += 16;
  4204 	    w -= 16;
  4207 	while (w)
  4209 	    d = (uint32_t) *dst;
  4211 	    *dst++ = (uint8_t) pack_1x128_32 (
  4212 		pix_multiply_1x128 (
  4213 		    xmm_alpha,
  4214 		    unpack_32_1x128 (d)));
  4215 	    w--;
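/*
 * IN operator: a8 source, a8 destination (dest = s * d per byte),
 * 16 pixels per iteration in the aligned middle loop.
 */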
  4221 static void
  4222 sse2_composite_in_8_8 (pixman_implementation_t *imp,
  4223                        pixman_composite_info_t *info)
  4225     PIXMAN_COMPOSITE_ARGS (info);
  4226     uint8_t     *dst_line, *dst;
  4227     uint8_t     *src_line, *src;
  4228     int src_stride, dst_stride;
  4229     int32_t w;
  4230     uint32_t s, d;
  4232     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  4233     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  4235     PIXMAN_IMAGE_GET_LINE (
  4236 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  4237     PIXMAN_IMAGE_GET_LINE (
  4238 	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
  4240     while (height--)
  4242 	dst = dst_line;
  4243 	dst_line += dst_stride;
  4244 	src = src_line;
  4245 	src_line += src_stride;
  4246 	w = width;
  4248 	while (w && ((uintptr_t)dst & 15))
  4250 	    s = (uint32_t) *src++;
  4251 	    d = (uint32_t) *dst;
  4253 	    *dst++ = (uint8_t) pack_1x128_32 (
  4254 		pix_multiply_1x128 (
  4255 		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
  4256 	    w--;
  4259 	while (w >= 16)
  4261 	    xmm_src = load_128_unaligned ((__m128i*)src);
  4262 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  4264 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  4265 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  4267 	    pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  4268 				&xmm_dst_lo, &xmm_dst_hi,
  4269 				&xmm_dst_lo, &xmm_dst_hi);
  4271 	    save_128_aligned (
  4272 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  4274 	    src += 16;
  4275 	    dst += 16;
  4276 	    w -= 16;
  4279 	while (w)
  4281 	    s = (uint32_t) *src++;
  4282 	    d = (uint32_t) *dst;
  4284 	    *dst++ = (uint8_t) pack_1x128_32 (
  4285 		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
  4286 	    w--;
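/*
 * ADD operator: solid source, a8 mask, a8 destination.  Each byte becomes
 * saturate (src.alpha * m + d); the saturating add is _mm_adds_epu16 on
 * the unpacked 16-bit lanes.
 */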
  4292 static void
  4293 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
  4294 			  pixman_composite_info_t *info)
  4296     PIXMAN_COMPOSITE_ARGS (info);
  4297     uint8_t     *dst_line, *dst;
  4298     uint8_t     *mask_line, *mask;
  4299     int dst_stride, mask_stride;
  4300     int32_t w;
  4301     uint32_t src;
  4302     uint32_t m, d;
  4304     __m128i xmm_alpha;
  4305     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  4306     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  4308     PIXMAN_IMAGE_GET_LINE (
  4309 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  4310     PIXMAN_IMAGE_GET_LINE (
  4311 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  4313     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  4315     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
  4317     while (height--)
  4319 	dst = dst_line;
  4320 	dst_line += dst_stride;
  4321 	mask = mask_line;
  4322 	mask_line += mask_stride;
  4323 	w = width;
  4325 	while (w && ((uintptr_t)dst & 15))
  4327 	    m = (uint32_t) *mask++;
  4328 	    d = (uint32_t) *dst;
  4330 	    *dst++ = (uint8_t) pack_1x128_32 (
  4331 		_mm_adds_epu16 (
  4332 		    pix_multiply_1x128 (
  4333 			xmm_alpha, unpack_32_1x128 (m)),
  4334 		    unpack_32_1x128 (d)));
  4335 	    w--;
  4338 	while (w >= 16)
  4340 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
  4341 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  4343 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  4344 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  4346 	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
  4347 				&xmm_mask_lo, &xmm_mask_hi,
  4348 				&xmm_mask_lo, &xmm_mask_hi);
  4350 	    xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
  4351 	    xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
  4353 	    save_128_aligned (
  4354 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  4356 	    mask += 16;
  4357 	    dst += 16;
  4358 	    w -= 16;
  4361 	while (w)
  4363 	    m = (uint32_t) *mask++;
  4364 	    d = (uint32_t) *dst;
  4366 	    *dst++ = (uint8_t) pack_1x128_32 (
  4367 		_mm_adds_epu16 (
  4368 		    pix_multiply_1x128 (
  4369 			xmm_alpha, unpack_32_1x128 (m)),
  4370 		    unpack_32_1x128 (d)));
  4372 	    w--;
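/*
 * ADD operator: solid source, no mask, a8 destination
 * (dest = saturate (src.alpha + d)).  The alpha byte is replicated into
 * all 16 lanes of xmm_src so the vector loop can use _mm_adds_epu8 on
 * packed 8-bit data.
 */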
  4378 static void
  4379 sse2_composite_add_n_8 (pixman_implementation_t *imp,
  4380 			pixman_composite_info_t *info)
  4382     PIXMAN_COMPOSITE_ARGS (info);
  4383     uint8_t     *dst_line, *dst;
  4384     int dst_stride;
  4385     int32_t w;
  4386     uint32_t src;
  4388     __m128i xmm_src;
  4390     PIXMAN_IMAGE_GET_LINE (
  4391 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  4393     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  4395     src >>= 24;
  4397     if (src == 0x00)
  4398 	return;
  4400     if (src == 0xff)
  4402 	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
  4403 		     8, dest_x, dest_y, width, height, 0xff);
  4405 	return;
  4408     src = (src << 24) | (src << 16) | (src << 8) | src;
  4409     xmm_src = _mm_set_epi32 (src, src, src, src);
  4411     while (height--)
  4413 	dst = dst_line;
  4414 	dst_line += dst_stride;
  4415 	w = width;
  4417 	while (w && ((uintptr_t)dst & 15))
  4419 	    *dst = (uint8_t)_mm_cvtsi128_si32 (
  4420 		_mm_adds_epu8 (
  4421 		    xmm_src,
  4422 		    _mm_cvtsi32_si128 (*dst)));
  4424 	    w--;
  4425 	    dst++;
  4428 	while (w >= 16)
  4430 	    save_128_aligned (
   4431 		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
  4433 	    dst += 16;
  4434 	    w -= 16;
  4437 	while (w)
  4439 	    *dst = (uint8_t)_mm_cvtsi128_si32 (
  4440 		_mm_adds_epu8 (
  4441 		    xmm_src,
  4442 		    _mm_cvtsi32_si128 (*dst)));
  4444 	    w--;
  4445 	    dst++;
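/*
 * ADD operator: a8 source, a8 destination.  The head and tail use a
 * branch-free scalar saturating add (t | (0 - (t >> 8))); the aligned
 * middle hands four bytes at a time to sse2_combine_add_u.
 */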
  4451 static void
  4452 sse2_composite_add_8_8 (pixman_implementation_t *imp,
  4453 			pixman_composite_info_t *info)
  4455     PIXMAN_COMPOSITE_ARGS (info);
  4456     uint8_t     *dst_line, *dst;
  4457     uint8_t     *src_line, *src;
  4458     int dst_stride, src_stride;
  4459     int32_t w;
  4460     uint16_t t;
  4462     PIXMAN_IMAGE_GET_LINE (
  4463 	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
  4464     PIXMAN_IMAGE_GET_LINE (
  4465 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  4467     while (height--)
  4469 	dst = dst_line;
  4470 	src = src_line;
  4472 	dst_line += dst_stride;
  4473 	src_line += src_stride;
  4474 	w = width;
  4476 	/* Small head */
  4477 	while (w && (uintptr_t)dst & 3)
  4479 	    t = (*dst) + (*src++);
  4480 	    *dst++ = t | (0 - (t >> 8));
  4481 	    w--;
  4484 	sse2_combine_add_u (imp, op,
  4485 			    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
  4487 	/* Small tail */
  4488 	dst += w & 0xfffc;
  4489 	src += w & 0xfffc;
  4491 	w &= 3;
  4493 	while (w)
  4495 	    t = (*dst) + (*src++);
  4496 	    *dst++ = t | (0 - (t >> 8));
  4497 	    w--;
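/*
 * ADD operator: a8r8g8b8 source onto an a8r8g8b8 destination; each
 * scanline is handed directly to sse2_combine_add_u.
 */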
  4503 static void
  4504 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
  4505                               pixman_composite_info_t *info)
  4507     PIXMAN_COMPOSITE_ARGS (info);
  4508     uint32_t    *dst_line, *dst;
  4509     uint32_t    *src_line, *src;
  4510     int dst_stride, src_stride;
  4512     PIXMAN_IMAGE_GET_LINE (
  4513 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  4514     PIXMAN_IMAGE_GET_LINE (
  4515 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  4517     while (height--)
  4519 	dst = dst_line;
  4520 	dst_line += dst_stride;
  4521 	src = src_line;
  4522 	src_line += src_stride;
  4524 	sse2_combine_add_u (imp, op, dst, src, NULL, width);
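/*
 * ADD operator: solid source onto an a8r8g8b8 destination
 * (dest = saturate (src + dest) per channel).  A zero source is a no-op
 * and src == ~0 degenerates to a fill with ~0.
 */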
  4528 static void
  4529 sse2_composite_add_n_8888 (pixman_implementation_t *imp,
  4530 			   pixman_composite_info_t *info)
  4532     PIXMAN_COMPOSITE_ARGS (info);
  4533     uint32_t *dst_line, *dst, src;
  4534     int dst_stride;
  4536     __m128i xmm_src;
  4538     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  4540     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  4541     if (src == 0)
  4542 	return;
  4544     if (src == ~0)
  4546 	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
  4547 		     dest_x, dest_y, width, height, ~0);
  4549 	return;
  4552     xmm_src = _mm_set_epi32 (src, src, src, src);
  4553     while (height--)
  4555 	int w = width;
  4556 	uint32_t d;
  4558 	dst = dst_line;
  4559 	dst_line += dst_stride;
   4561 	while (w && (uintptr_t)dst & 15)
  4563 	    d = *dst;
  4564 	    *dst++ =
  4565 		_mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
  4566 	    w--;
  4569 	while (w >= 4)
  4571 	    save_128_aligned
  4572 		((__m128i*)dst,
  4573 		 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
  4575 	    dst += 4;
  4576 	    w -= 4;
  4579 	while (w--)
  4581 	    d = *dst;
  4582 	    *dst++ =
  4583 		_mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
  4584 						  _mm_cvtsi32_si128 (d)));
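/*
 * ADD operator: solid source, a8 mask, a8r8g8b8 destination
 * (dest = saturate (src * m + dest)); pixels whose mask byte is zero are
 * skipped.
 */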
  4589 static void
  4590 sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
  4591 			     pixman_composite_info_t *info)
  4593     PIXMAN_COMPOSITE_ARGS (info);
  4594     uint32_t     *dst_line, *dst;
  4595     uint8_t     *mask_line, *mask;
  4596     int dst_stride, mask_stride;
  4597     int32_t w;
  4598     uint32_t src;
  4600     __m128i xmm_src;
  4602     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  4603     if (src == 0)
  4604 	return;
  4605     xmm_src = expand_pixel_32_1x128 (src);
  4607     PIXMAN_IMAGE_GET_LINE (
  4608 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  4609     PIXMAN_IMAGE_GET_LINE (
  4610 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  4612     while (height--)
  4614 	dst = dst_line;
  4615 	dst_line += dst_stride;
  4616 	mask = mask_line;
  4617 	mask_line += mask_stride;
  4618 	w = width;
   4620 	while (w && ((uintptr_t)dst & 15))
  4622 	    uint8_t m = *mask++;
  4623 	    if (m)
  4625 		*dst = pack_1x128_32
  4626 		    (_mm_adds_epu16
  4627 		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
  4628 		      unpack_32_1x128 (*dst)));
  4630 	    dst++;
  4631 	    w--;
  4634 	while (w >= 4)
  4636 	    uint32_t m = *(uint32_t*)mask;
  4637 	    if (m)
  4639 		__m128i xmm_mask_lo, xmm_mask_hi;
  4640 		__m128i xmm_dst_lo, xmm_dst_hi;
  4642 		__m128i xmm_dst = load_128_aligned ((__m128i*)dst);
  4643 		__m128i xmm_mask =
  4644 		    _mm_unpacklo_epi8 (unpack_32_1x128(m),
  4645 				       _mm_setzero_si128 ());
  4647 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  4648 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  4650 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
  4651 					&xmm_mask_lo, &xmm_mask_hi);
  4653 		pix_multiply_2x128 (&xmm_src, &xmm_src,
  4654 				    &xmm_mask_lo, &xmm_mask_hi,
  4655 				    &xmm_mask_lo, &xmm_mask_hi);
  4657 		xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
  4658 		xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
  4660 		save_128_aligned (
  4661 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  4664 	    w -= 4;
  4665 	    dst += 4;
  4666 	    mask += 4;
  4669 	while (w)
  4671 	    uint8_t m = *mask++;
  4672 	    if (m)
  4674 		*dst = pack_1x128_32
  4675 		    (_mm_adds_epu16
  4676 		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
  4677 		      unpack_32_1x128 (*dst)));
  4679 	    dst++;
  4680 	    w--;
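/*
 * Plain blit between two images of equal depth (16 or 32 bpp only).
 * After aligning the destination, the main loop copies 64 bytes per
 * iteration using unaligned loads and aligned stores.
 */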
  4685 static pixman_bool_t
  4686 sse2_blt (pixman_implementation_t *imp,
  4687           uint32_t *               src_bits,
  4688           uint32_t *               dst_bits,
  4689           int                      src_stride,
  4690           int                      dst_stride,
  4691           int                      src_bpp,
  4692           int                      dst_bpp,
  4693           int                      src_x,
  4694           int                      src_y,
  4695           int                      dest_x,
  4696           int                      dest_y,
  4697           int                      width,
  4698           int                      height)
  4700     uint8_t *   src_bytes;
  4701     uint8_t *   dst_bytes;
  4702     int byte_width;
  4704     if (src_bpp != dst_bpp)
  4705 	return FALSE;
  4707     if (src_bpp == 16)
  4709 	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
  4710 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
   4711 	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
  4712 	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
  4713 	byte_width = 2 * width;
  4714 	src_stride *= 2;
  4715 	dst_stride *= 2;
  4717     else if (src_bpp == 32)
  4719 	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
  4720 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
  4721 	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
  4722 	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
  4723 	byte_width = 4 * width;
  4724 	src_stride *= 4;
  4725 	dst_stride *= 4;
  4727     else
  4729 	return FALSE;
  4732     while (height--)
  4734 	int w;
  4735 	uint8_t *s = src_bytes;
  4736 	uint8_t *d = dst_bytes;
  4737 	src_bytes += src_stride;
  4738 	dst_bytes += dst_stride;
  4739 	w = byte_width;
  4741 	while (w >= 2 && ((uintptr_t)d & 3))
  4743 	    *(uint16_t *)d = *(uint16_t *)s;
  4744 	    w -= 2;
  4745 	    s += 2;
  4746 	    d += 2;
  4749 	while (w >= 4 && ((uintptr_t)d & 15))
  4751 	    *(uint32_t *)d = *(uint32_t *)s;
  4753 	    w -= 4;
  4754 	    s += 4;
  4755 	    d += 4;
  4758 	while (w >= 64)
  4760 	    __m128i xmm0, xmm1, xmm2, xmm3;
  4762 	    xmm0 = load_128_unaligned ((__m128i*)(s));
  4763 	    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
  4764 	    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
  4765 	    xmm3 = load_128_unaligned ((__m128i*)(s + 48));
  4767 	    save_128_aligned ((__m128i*)(d),    xmm0);
  4768 	    save_128_aligned ((__m128i*)(d + 16), xmm1);
  4769 	    save_128_aligned ((__m128i*)(d + 32), xmm2);
  4770 	    save_128_aligned ((__m128i*)(d + 48), xmm3);
  4772 	    s += 64;
  4773 	    d += 64;
  4774 	    w -= 64;
  4777 	while (w >= 16)
   4779 	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s));
  4781 	    w -= 16;
  4782 	    d += 16;
  4783 	    s += 16;
  4786 	while (w >= 4)
  4788 	    *(uint32_t *)d = *(uint32_t *)s;
  4790 	    w -= 4;
  4791 	    s += 4;
  4792 	    d += 4;
  4795 	if (w >= 2)
  4797 	    *(uint16_t *)d = *(uint16_t *)s;
  4798 	    w -= 2;
  4799 	    s += 2;
  4800 	    d += 2;
  4804     return TRUE;
  4807 static void
  4808 sse2_composite_copy_area (pixman_implementation_t *imp,
  4809                           pixman_composite_info_t *info)
  4811     PIXMAN_COMPOSITE_ARGS (info);
  4812     sse2_blt (imp, src_image->bits.bits,
  4813 	      dest_image->bits.bits,
  4814 	      src_image->bits.rowstride,
  4815 	      dest_image->bits.rowstride,
  4816 	      PIXMAN_FORMAT_BPP (src_image->bits.format),
  4817 	      PIXMAN_FORMAT_BPP (dest_image->bits.format),
  4818 	      src_x, src_y, dest_x, dest_y, width, height);
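/*
 * OVER operator: x8r8g8b8 source, a8 mask, 8888 destination.  The source
 * is forced opaque (0xff000000 | s), so the blend reduces to in_over with
 * a constant 0xff source alpha (mask_00ff).
 */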
  4821 static void
  4822 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
  4823                                  pixman_composite_info_t *info)
  4825     PIXMAN_COMPOSITE_ARGS (info);
  4826     uint32_t    *src, *src_line, s;
  4827     uint32_t    *dst, *dst_line, d;
  4828     uint8_t         *mask, *mask_line;
  4829     uint32_t m;
  4830     int src_stride, mask_stride, dst_stride;
  4831     int32_t w;
  4832     __m128i ms;
  4834     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  4835     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  4836     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  4838     PIXMAN_IMAGE_GET_LINE (
  4839 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  4840     PIXMAN_IMAGE_GET_LINE (
  4841 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  4842     PIXMAN_IMAGE_GET_LINE (
  4843 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  4845     while (height--)
  4847         src = src_line;
  4848         src_line += src_stride;
  4849         dst = dst_line;
  4850         dst_line += dst_stride;
  4851         mask = mask_line;
  4852         mask_line += mask_stride;
  4854         w = width;
  4856         while (w && (uintptr_t)dst & 15)
  4858             s = 0xff000000 | *src++;
  4859             m = (uint32_t) *mask++;
  4860             d = *dst;
  4861             ms = unpack_32_1x128 (s);
  4863             if (m != 0xff)
  4865 		__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
  4866 		__m128i md = unpack_32_1x128 (d);
  4868                 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
  4871             *dst++ = pack_1x128_32 (ms);
  4872             w--;
  4875         while (w >= 4)
  4877             m = *(uint32_t*) mask;
  4878             xmm_src = _mm_or_si128 (
  4879 		load_128_unaligned ((__m128i*)src), mask_ff000000);
  4881             if (m == 0xffffffff)
  4883                 save_128_aligned ((__m128i*)dst, xmm_src);
  4885             else
  4887                 xmm_dst = load_128_aligned ((__m128i*)dst);
  4889                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
  4891                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  4892                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  4893                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  4895                 expand_alpha_rev_2x128 (
  4896 		    xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  4898                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
  4899 			       &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
  4900 			       &xmm_dst_lo, &xmm_dst_hi);
  4902                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  4905             src += 4;
  4906             dst += 4;
  4907             mask += 4;
  4908             w -= 4;
  4911         while (w)
  4913             m = (uint32_t) *mask++;
  4915             if (m)
  4917                 s = 0xff000000 | *src;
  4919                 if (m == 0xff)
  4921                     *dst = s;
  4923                 else
  4925 		    __m128i ma, md, ms;
  4927                     d = *dst;
  4929 		    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
  4930 		    md = unpack_32_1x128 (d);
  4931 		    ms = unpack_32_1x128 (s);
  4933                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
  4938             src++;
  4939             dst++;
  4940             w--;
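/*
 * OVER operator: a8r8g8b8 source, a8 mask, 8888 destination.  Opaque
 * source pixels under a 0xff mask are stored directly; everything else
 * goes through in_over with the source alpha and the expanded mask.
 */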
  4946 static void
  4947 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
  4948                                  pixman_composite_info_t *info)
  4950     PIXMAN_COMPOSITE_ARGS (info);
  4951     uint32_t    *src, *src_line, s;
  4952     uint32_t    *dst, *dst_line, d;
  4953     uint8_t         *mask, *mask_line;
  4954     uint32_t m;
  4955     int src_stride, mask_stride, dst_stride;
  4956     int32_t w;
  4958     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
  4959     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  4960     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  4962     PIXMAN_IMAGE_GET_LINE (
  4963 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  4964     PIXMAN_IMAGE_GET_LINE (
  4965 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  4966     PIXMAN_IMAGE_GET_LINE (
  4967 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  4969     while (height--)
  4971         src = src_line;
  4972         src_line += src_stride;
  4973         dst = dst_line;
  4974         dst_line += dst_stride;
  4975         mask = mask_line;
  4976         mask_line += mask_stride;
  4978         w = width;
  4980         while (w && (uintptr_t)dst & 15)
  4982 	    uint32_t sa;
  4984             s = *src++;
  4985             m = (uint32_t) *mask++;
  4986             d = *dst;
  4988 	    sa = s >> 24;
  4990 	    if (m)
  4992 		if (sa == 0xff && m == 0xff)
  4994 		    *dst = s;
  4996 		else
  4998 		    __m128i ms, md, ma, msa;
  5000 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
  5001 		    ms = unpack_32_1x128 (s);
  5002 		    md = unpack_32_1x128 (d);
  5004 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
  5006 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
  5010 	    dst++;
  5011             w--;
  5014         while (w >= 4)
  5016             m = *(uint32_t *) mask;
  5018 	    if (m)
  5020 		xmm_src = load_128_unaligned ((__m128i*)src);
  5022 		if (m == 0xffffffff && is_opaque (xmm_src))
  5024 		    save_128_aligned ((__m128i *)dst, xmm_src);
  5026 		else
  5028 		    xmm_dst = load_128_aligned ((__m128i *)dst);
  5030 		    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
  5032 		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  5033 		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  5034 		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  5036 		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
  5037 		    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  5039 		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
  5040 				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
  5042 		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  5046             src += 4;
  5047             dst += 4;
  5048             mask += 4;
  5049             w -= 4;
  5052         while (w)
  5054 	    uint32_t sa;
  5056             s = *src++;
  5057             m = (uint32_t) *mask++;
  5058             d = *dst;
  5060 	    sa = s >> 24;
  5062 	    if (m)
  5064 		if (sa == 0xff && m == 0xff)
  5066 		    *dst = s;
  5068 		else
  5070 		    __m128i ms, md, ma, msa;
  5072 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
  5073 		    ms = unpack_32_1x128 (s);
  5074 		    md = unpack_32_1x128 (d);
  5076 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
  5078 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
  5082 	    dst++;
  5083             w--;
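/*
 * OVER_REVERSE operator with a solid source: the destination is
 * composited over the constant source, dest = dest + (1 - dest.alpha) * src.
 */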
  5089 static void
  5090 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
  5091 				    pixman_composite_info_t *info)
  5093     PIXMAN_COMPOSITE_ARGS (info);
  5094     uint32_t src;
  5095     uint32_t    *dst_line, *dst;
  5096     __m128i xmm_src;
  5097     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  5098     __m128i xmm_dsta_hi, xmm_dsta_lo;
  5099     int dst_stride;
  5100     int32_t w;
  5102     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  5104     if (src == 0)
  5105 	return;
  5107     PIXMAN_IMAGE_GET_LINE (
  5108 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  5110     xmm_src = expand_pixel_32_1x128 (src);
  5112     while (height--)
  5114 	dst = dst_line;
  5116 	dst_line += dst_stride;
  5117 	w = width;
  5119 	while (w && (uintptr_t)dst & 15)
  5121 	    __m128i vd;
  5123 	    vd = unpack_32_1x128 (*dst);
  5125 	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
  5126 					      xmm_src));
  5127 	    w--;
  5128 	    dst++;
  5131 	while (w >= 4)
  5133 	    __m128i tmp_lo, tmp_hi;
  5135 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  5137 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  5138 	    expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
  5140 	    tmp_lo = xmm_src;
  5141 	    tmp_hi = xmm_src;
  5143 	    over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
  5144 			&xmm_dsta_lo, &xmm_dsta_hi,
  5145 			&tmp_lo, &tmp_hi);
  5147 	    save_128_aligned (
  5148 		(__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
  5150 	    w -= 4;
  5151 	    dst += 4;
  5154 	while (w)
  5156 	    __m128i vd;
  5158 	    vd = unpack_32_1x128 (*dst);
  5160 	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
  5161 					      xmm_src));
  5162 	    w--;
  5163 	    dst++;
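/*
 * OVER operator: a8r8g8b8 source, a8r8g8b8 mask, 8888 destination.  Only
 * the alpha channel of the mask is used (m >> 24 in the scalar paths,
 * expand_alpha_2x128 in the vector path).
 */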
  5170 static void
  5171 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
  5172 				    pixman_composite_info_t *info)
  5174     PIXMAN_COMPOSITE_ARGS (info);
  5175     uint32_t    *src, *src_line, s;
  5176     uint32_t    *dst, *dst_line, d;
  5177     uint32_t    *mask, *mask_line;
  5178     uint32_t    m;
  5179     int src_stride, mask_stride, dst_stride;
  5180     int32_t w;
  5182     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
  5183     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  5184     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  5186     PIXMAN_IMAGE_GET_LINE (
  5187 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  5188     PIXMAN_IMAGE_GET_LINE (
  5189 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
  5190     PIXMAN_IMAGE_GET_LINE (
  5191 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  5193     while (height--)
  5195         src = src_line;
  5196         src_line += src_stride;
  5197         dst = dst_line;
  5198         dst_line += dst_stride;
  5199         mask = mask_line;
  5200         mask_line += mask_stride;
  5202         w = width;
  5204         while (w && (uintptr_t)dst & 15)
  5206 	    uint32_t sa;
  5208             s = *src++;
  5209             m = (*mask++) >> 24;
  5210             d = *dst;
  5212 	    sa = s >> 24;
  5214 	    if (m)
  5216 		if (sa == 0xff && m == 0xff)
  5218 		    *dst = s;
  5220 		else
  5222 		    __m128i ms, md, ma, msa;
  5224 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
  5225 		    ms = unpack_32_1x128 (s);
  5226 		    md = unpack_32_1x128 (d);
  5228 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
  5230 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
  5234 	    dst++;
  5235             w--;
  5238         while (w >= 4)
  5240 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
  5242 	    if (!is_transparent (xmm_mask))
  5244 		xmm_src = load_128_unaligned ((__m128i*)src);
  5246 		if (is_opaque (xmm_mask) && is_opaque (xmm_src))
  5248 		    save_128_aligned ((__m128i *)dst, xmm_src);
  5250 		else
  5252 		    xmm_dst = load_128_aligned ((__m128i *)dst);
  5254 		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  5255 		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  5256 		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  5258 		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
  5259 		    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  5261 		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
  5262 				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
  5264 		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  5268             src += 4;
  5269             dst += 4;
  5270             mask += 4;
  5271             w -= 4;
  5274         while (w)
  5276 	    uint32_t sa;
  5278             s = *src++;
  5279             m = (*mask++) >> 24;
  5280             d = *dst;
  5282 	    sa = s >> 24;
  5284 	    if (m)
  5286 		if (sa == 0xff && m == 0xff)
  5288 		    *dst = s;
  5290 		else
  5292 		    __m128i ms, md, ma, msa;
  5294 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
  5295 		    ms = unpack_32_1x128 (s);
  5296 		    md = unpack_32_1x128 (d);
  5298 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
  5300 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
  5304 	    dst++;
  5305             w--;
  5311 /* A variant of 'sse2_combine_over_u' with minor tweaks */
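/*
 * One scanline of nearest-neighbour scaling: vx advances by unit_x per
 * destination pixel and is wrapped back by src_width_fixed whenever it
 * becomes non-negative (apparently how the NORMAL repeat variant keeps
 * the source coordinate in range); four fetched source pixels at a time
 * are packed into a vector for the OVER blend.
 */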
  5312 static force_inline void
  5313 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
  5314                                              const uint32_t* ps,
  5315                                              int32_t         w,
  5316                                              pixman_fixed_t  vx,
  5317                                              pixman_fixed_t  unit_x,
  5318                                              pixman_fixed_t  src_width_fixed,
  5319                                              pixman_bool_t   fully_transparent_src)
  5321     uint32_t s, d;
  5322     const uint32_t* pm = NULL;
  5324     __m128i xmm_dst_lo, xmm_dst_hi;
  5325     __m128i xmm_src_lo, xmm_src_hi;
  5326     __m128i xmm_alpha_lo, xmm_alpha_hi;
  5328     if (fully_transparent_src)
  5329 	return;
  5331     /* Align dst on a 16-byte boundary */
  5332     while (w && ((uintptr_t)pd & 15))
  5334 	d = *pd;
  5335 	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
  5336 	vx += unit_x;
  5337 	while (vx >= 0)
  5338 	    vx -= src_width_fixed;
  5340 	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
  5341 	if (pm)
  5342 	    pm++;
  5343 	w--;
  5346     while (w >= 4)
  5348 	__m128i tmp;
  5349 	uint32_t tmp1, tmp2, tmp3, tmp4;
  5351 	tmp1 = *(ps + pixman_fixed_to_int (vx));
  5352 	vx += unit_x;
  5353 	while (vx >= 0)
  5354 	    vx -= src_width_fixed;
  5355 	tmp2 = *(ps + pixman_fixed_to_int (vx));
  5356 	vx += unit_x;
  5357 	while (vx >= 0)
  5358 	    vx -= src_width_fixed;
  5359 	tmp3 = *(ps + pixman_fixed_to_int (vx));
  5360 	vx += unit_x;
  5361 	while (vx >= 0)
  5362 	    vx -= src_width_fixed;
  5363 	tmp4 = *(ps + pixman_fixed_to_int (vx));
  5364 	vx += unit_x;
  5365 	while (vx >= 0)
  5366 	    vx -= src_width_fixed;
  5368 	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
  5370 	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
  5372 	if (is_opaque (xmm_src_hi))
  5374 	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
  5376 	else if (!is_zero (xmm_src_hi))
  5378 	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);
  5380 	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  5381 	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  5383 	    expand_alpha_2x128 (
  5384 		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
  5386 	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
  5387 			&xmm_alpha_lo, &xmm_alpha_hi,
  5388 			&xmm_dst_lo, &xmm_dst_hi);
   5390 	    /* rebuild the 4 pixel data and save */
  5391 	    save_128_aligned ((__m128i*)pd,
  5392 			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  5395 	w -= 4;
  5396 	pd += 4;
  5397 	if (pm)
  5398 	    pm += 4;
  5401     while (w)
  5403 	d = *pd;
  5404 	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
  5405 	vx += unit_x;
  5406 	while (vx >= 0)
  5407 	    vx -= src_width_fixed;
  5409 	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
  5410 	if (pm)
  5411 	    pm++;
  5413 	w--;
  5417 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
  5418 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
  5419 		       uint32_t, uint32_t, COVER)
  5420 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
  5421 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
  5422 		       uint32_t, uint32_t, NONE)
  5423 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
  5424 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
  5425 		       uint32_t, uint32_t, PAD)
  5426 FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
  5427 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
  5428 		       uint32_t, uint32_t, NORMAL)
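/*
 * Nearest-neighbour scaling with a solid mask: the constant mask alpha
 * (*mask >> 24) is expanded once via create_mask_16_128 and reused for
 * every pixel of the scanline.
 */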
  5430 static force_inline void
  5431 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
  5432 					       uint32_t *       dst,
  5433 					       const uint32_t * src,
  5434 					       int32_t          w,
  5435 					       pixman_fixed_t   vx,
  5436 					       pixman_fixed_t   unit_x,
  5437 					       pixman_fixed_t   src_width_fixed,
  5438 					       pixman_bool_t    zero_src)
  5440     __m128i xmm_mask;
  5441     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  5442     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  5443     __m128i xmm_alpha_lo, xmm_alpha_hi;
  5445     if (zero_src || (*mask >> 24) == 0)
  5446 	return;
  5448     xmm_mask = create_mask_16_128 (*mask >> 24);
  5450     while (w && (uintptr_t)dst & 15)
  5452 	uint32_t s = *(src + pixman_fixed_to_int (vx));
  5453 	vx += unit_x;
  5454 	while (vx >= 0)
  5455 	    vx -= src_width_fixed;
  5457 	if (s)
  5459 	    uint32_t d = *dst;
  5461 	    __m128i ms = unpack_32_1x128 (s);
  5462 	    __m128i alpha     = expand_alpha_1x128 (ms);
  5463 	    __m128i dest      = xmm_mask;
  5464 	    __m128i alpha_dst = unpack_32_1x128 (d);
  5466 	    *dst = pack_1x128_32 (
  5467 		in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
  5469 	dst++;
  5470 	w--;
  5473     while (w >= 4)
  5475 	uint32_t tmp1, tmp2, tmp3, tmp4;
  5477 	tmp1 = *(src + pixman_fixed_to_int (vx));
  5478 	vx += unit_x;
  5479 	while (vx >= 0)
  5480 	    vx -= src_width_fixed;
  5481 	tmp2 = *(src + pixman_fixed_to_int (vx));
  5482 	vx += unit_x;
  5483 	while (vx >= 0)
  5484 	    vx -= src_width_fixed;
  5485 	tmp3 = *(src + pixman_fixed_to_int (vx));
  5486 	vx += unit_x;
  5487 	while (vx >= 0)
  5488 	    vx -= src_width_fixed;
  5489 	tmp4 = *(src + pixman_fixed_to_int (vx));
  5490 	vx += unit_x;
  5491 	while (vx >= 0)
  5492 	    vx -= src_width_fixed;
  5494 	xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
  5496 	if (!is_zero (xmm_src))
  5498 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  5500 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  5501 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  5502 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  5503 			        &xmm_alpha_lo, &xmm_alpha_hi);
  5505 	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
  5506 			   &xmm_alpha_lo, &xmm_alpha_hi,
  5507 			   &xmm_mask, &xmm_mask,
  5508 			   &xmm_dst_lo, &xmm_dst_hi);
  5510 	    save_128_aligned (
  5511 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  5514 	dst += 4;
  5515 	w -= 4;
  5518     while (w)
  5520 	uint32_t s = *(src + pixman_fixed_to_int (vx));
  5521 	vx += unit_x;
  5522 	while (vx >= 0)
  5523 	    vx -= src_width_fixed;
  5525 	if (s)
  5527 	    uint32_t d = *dst;
  5529 	    __m128i ms = unpack_32_1x128 (s);
  5530 	    __m128i alpha = expand_alpha_1x128 (ms);
  5531 	    __m128i mask  = xmm_mask;
  5532 	    __m128i dest  = unpack_32_1x128 (d);
  5534 	    *dst = pack_1x128_32 (
  5535 		in_over_1x128 (&ms, &alpha, &mask, &dest));
  5538 	dst++;
  5539 	w--;
  5544 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
  5545 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
  5546 			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
  5547 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
  5548 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
  5549 			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
  5550 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
  5551 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
  5552 			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
  5553 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
  5554 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
  5555 			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
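/*
 * Bilinear interpolation helpers: a 2x2 source block (top pair / bottom
 * pair) is interpolated vertically with the weights wt/wb and then
 * horizontally with weights derived from the fractional bits of vx;
 * BMSK masks the fractional part of the coordinate used for the
 * horizontal weights.
 */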
  5557 #define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
  5559 #define BILINEAR_DECLARE_VARIABLES						\
  5560     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
  5561     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
  5562     const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\
  5563     const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
  5564     const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\
  5565     const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
  5566     const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,	\
  5567 					  unit_x, unit_x, unit_x, unit_x);	\
  5568     const __m128i xmm_zero = _mm_setzero_si128 ();				\
  5569     __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
  5571 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
  5572 do {										\
  5573     __m128i xmm_wh, xmm_lo, xmm_hi, a;						\
  5574     /* fetch 2x2 pixel block into sse2 registers */				\
  5575     __m128i tltr = _mm_loadl_epi64 (						\
  5576 			    (__m128i *)&src_top[pixman_fixed_to_int (vx)]);	\
  5577     __m128i blbr = _mm_loadl_epi64 (						\
  5578 			    (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]);	\
  5579     vx += unit_x;								\
  5580     /* vertical interpolation */						\
  5581     a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero),	\
  5582 					xmm_wt),				\
  5583 		       _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero),	\
  5584 					xmm_wb));				\
  5585     if (BILINEAR_INTERPOLATION_BITS < 8)					\
  5586     {										\
  5587 	/* calculate horizontal weights */					\
  5588 	xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7,		\
  5589 		   _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));	\
  5590 	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
  5591 	/* horizontal interpolation */						\
  5592 	a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
  5593 		a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh);			\
  5594     }										\
  5595     else									\
  5596     {										\
  5597 	/* calculate horizontal weights */					\
  5598 	xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8,		\
  5599 		_mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));	\
  5600 	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
  5601 	/* horizontal interpolation */						\
  5602 	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
  5603 	xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\
  5604 	a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\
  5605 			   _mm_unpackhi_epi16 (xmm_lo, xmm_hi));		\
  5606     }										\
  5607     /* shift and pack the result */						\
  5608     a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2);			\
  5609     a = _mm_packs_epi32 (a, a);							\
  5610     a = _mm_packus_epi16 (a, a);						\
  5611     pix = _mm_cvtsi128_si32 (a);						\
  5612 } while (0)
  5614 #define BILINEAR_SKIP_ONE_PIXEL()						\
  5615 do {										\
  5616     vx += unit_x;								\
  5617     xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
  5618 } while(0)
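/*
 * SRC operator with bilinear scaling: every destination pixel is simply
 * the interpolated source pixel, emitted four at a time where possible.
 */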
  5620 static force_inline void
  5621 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
  5622 					     const uint32_t * mask,
  5623 					     const uint32_t * src_top,
  5624 					     const uint32_t * src_bottom,
  5625 					     int32_t          w,
  5626 					     int              wt,
  5627 					     int              wb,
  5628 					     pixman_fixed_t   vx,
  5629 					     pixman_fixed_t   unit_x,
  5630 					     pixman_fixed_t   max_vx,
  5631 					     pixman_bool_t    zero_src)
  5633     BILINEAR_DECLARE_VARIABLES;
  5634     uint32_t pix1, pix2, pix3, pix4;
  5636     while ((w -= 4) >= 0)
  5638 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  5639 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
  5640 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
  5641 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
  5642 	*dst++ = pix1;
  5643 	*dst++ = pix2;
  5644 	*dst++ = pix3;
  5645 	*dst++ = pix4;
  5648     if (w & 2)
  5650 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  5651 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
  5652 	*dst++ = pix1;
  5653 	*dst++ = pix2;
  5656     if (w & 1)
  5658 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  5659 	*dst = pix1;
  5664 /* Add extra NULL argument to the existing bilinear fast paths to indicate
  5665  * that we don't need two-pass processing */
  5667 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
  5668 			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
  5669 			       uint32_t, uint32_t, uint32_t,
  5670 			       COVER, FLAG_NONE)
  5671 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
  5672 			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
  5673 			       uint32_t, uint32_t, uint32_t,
  5674 			       PAD, FLAG_NONE)
  5675 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
  5676 			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
  5677 			       uint32_t, uint32_t, uint32_t,
  5678 			       NONE, FLAG_NONE)
  5679 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
  5680 			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
  5681 			       uint32_t, uint32_t, uint32_t,
  5682 			       NORMAL, FLAG_NONE)
  5684 static force_inline void
  5685 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
  5686 					      const uint32_t * mask,
  5687 					      const uint32_t * src_top,
  5688 					      const uint32_t * src_bottom,
  5689 					      int32_t          w,
  5690 					      int              wt,
  5691 					      int              wb,
  5692 					      pixman_fixed_t   vx,
  5693 					      pixman_fixed_t   unit_x,
  5694 					      pixman_fixed_t   max_vx,
  5695 					      pixman_bool_t    zero_src)
  5697     BILINEAR_DECLARE_VARIABLES;
  5698     uint32_t pix1, pix2, pix3, pix4;
  5700     while (w && ((uintptr_t)dst & 15))
  5702 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  5704 	if (pix1)
  5706 	    pix2 = *dst;
  5707 	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
  5710 	w--;
  5711 	dst++;
   5714 	while (w >= 4)
  5716 	__m128i xmm_src;
  5717 	__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
  5718 	__m128i xmm_alpha_hi, xmm_alpha_lo;
  5720 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  5721 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
  5722 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
  5723 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
  5725 	xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
  5727 	if (!is_zero (xmm_src))
  5729 	    if (is_opaque (xmm_src))
  5731 		save_128_aligned ((__m128i *)dst, xmm_src);
  5733 	    else
  5735 		__m128i xmm_dst = load_128_aligned ((__m128i *)dst);
  5737 		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  5738 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  5740 		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
  5741 		over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
  5742 			    &xmm_dst_lo, &xmm_dst_hi);
  5744 		save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  5748 	w -= 4;
  5749 	dst += 4;
  5752     while (w)
  5754 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  5756 	if (pix1)
  5758 	    pix2 = *dst;
  5759 	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
  5762 	w--;
  5763 	dst++;
  5767 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
  5768 			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
  5769 			       uint32_t, uint32_t, uint32_t,
  5770 			       COVER, FLAG_NONE)
  5771 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
  5772 			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
  5773 			       uint32_t, uint32_t, uint32_t,
  5774 			       PAD, FLAG_NONE)
  5775 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
  5776 			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
  5777 			       uint32_t, uint32_t, uint32_t,
  5778 			       NONE, FLAG_NONE)
  5779 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
  5780 			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
  5781 			       uint32_t, uint32_t, uint32_t,
  5782 			       NORMAL, FLAG_NONE)
   5785 /* An example of an SSE2 two-stage bilinear_over_8888_0565 fast path, implemented
   5786    as scaled_bilinear_scanline_sse2_8888_8888_SRC + op_bilinear_over_8888_0565 */
  5788 void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width)
   5790     /* Note: this is not really fast; it should be based on the 8-pixel loop from sse2_composite_over_8888_0565 */
  5791     while (--width >= 0)
  5793 	*dst = composite_over_8888_0565pixel (*src, *dst);
  5794 	src++;
  5795 	dst++;
  5799 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_cover_OVER,
  5800 			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
  5801 			       uint32_t, uint32_t, uint16_t,
  5802 			       COVER, FLAG_NONE)
  5803 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_pad_OVER,
  5804 			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
  5805 			       uint32_t, uint32_t, uint16_t,
  5806 			       PAD, FLAG_NONE)
  5807 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_none_OVER,
  5808 			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
  5809 			       uint32_t, uint32_t, uint16_t,
  5810 			       NONE, FLAG_NONE)
  5811 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_normal_OVER,
  5812 			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
  5813 			       uint32_t, uint32_t, uint16_t,
  5814 			       NORMAL, FLAG_NONE)
  5816 /*****************************/
  5818 static force_inline void
  5819 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
  5820 						const uint8_t  * mask,
  5821 						const uint32_t * src_top,
  5822 						const uint32_t * src_bottom,
  5823 						int32_t          w,
  5824 						int              wt,
  5825 						int              wb,
  5826 						pixman_fixed_t   vx,
  5827 						pixman_fixed_t   unit_x,
  5828 						pixman_fixed_t   max_vx,
  5829 						pixman_bool_t    zero_src)
  5831     BILINEAR_DECLARE_VARIABLES;
  5832     uint32_t pix1, pix2, pix3, pix4;
  5833     uint32_t m;
  5835     while (w && ((uintptr_t)dst & 15))
  5837 	uint32_t sa;
  5839 	m = (uint32_t) *mask++;
  5841 	if (m)
  5843 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  5844 	    sa = pix1 >> 24;
  5846 	    if (sa == 0xff && m == 0xff)
  5848 		*dst = pix1;
  5850 	    else
  5852 		__m128i ms, md, ma, msa;
  5854 		pix2 = *dst;
  5855 		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
  5856 		ms = unpack_32_1x128 (pix1);
  5857 		md = unpack_32_1x128 (pix2);
  5859 		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
  5861 		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
  5864 	else
  5866 	    BILINEAR_SKIP_ONE_PIXEL ();
  5869 	w--;
  5870 	dst++;
  5873     while (w >= 4)
  5875 	__m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
  5876 	__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  5877 	__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  5879 	m = *(uint32_t*)mask;
  5881 	if (m)
  5883 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  5884 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
  5885 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
  5886 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
  5888 	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
  5890 	    if (m == 0xffffffff && is_opaque (xmm_src))
  5892 		save_128_aligned ((__m128i *)dst, xmm_src);
  5894 	    else
  5896 		xmm_dst = load_128_aligned ((__m128i *)dst);
  5898 		xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
  5900 		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  5901 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  5902 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  5904 		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
  5905 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  5907 		in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
  5908 			       &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
  5910 		save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  5913 	else
  5915 	    BILINEAR_SKIP_ONE_PIXEL ();
  5916 	    BILINEAR_SKIP_ONE_PIXEL ();
  5917 	    BILINEAR_SKIP_ONE_PIXEL ();
  5918 	    BILINEAR_SKIP_ONE_PIXEL ();
  5921 	w -= 4;
  5922 	dst += 4;
  5923 	mask += 4;
  5926     while (w)
  5928 	uint32_t sa;
  5930 	m = (uint32_t) *mask++;
  5932 	if (m)
  5934 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  5935 	    sa = pix1 >> 24;
  5937 	    if (sa == 0xff && m == 0xff)
  5939 		*dst = pix1;
  5941 	    else
  5943 		__m128i ms, md, ma, msa;
  5945 		pix2 = *dst;
  5946 		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
  5947 		ms = unpack_32_1x128 (pix1);
  5948 		md = unpack_32_1x128 (pix2);
  5950 		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
  5952 		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
  5955 	else
  5957 	    BILINEAR_SKIP_ONE_PIXEL ();
  5960 	w--;
  5961 	dst++;
  5965 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
  5966 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
  5967 			       uint32_t, uint8_t, uint32_t,
  5968 			       COVER, FLAG_HAVE_NON_SOLID_MASK)
  5969 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
  5970 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
  5971 			       uint32_t, uint8_t, uint32_t,
  5972 			       PAD, FLAG_HAVE_NON_SOLID_MASK)
  5973 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
  5974 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
  5975 			       uint32_t, uint8_t, uint32_t,
  5976 			       NONE, FLAG_HAVE_NON_SOLID_MASK)
  5977 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
  5978 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
  5979 			       uint32_t, uint8_t, uint32_t,
  5980 			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
  5982 static force_inline void
  5983 scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
  5984 						const uint32_t * mask,
  5985 						const uint32_t * src_top,
  5986 						const uint32_t * src_bottom,
  5987 						int32_t          w,
  5988 						int              wt,
  5989 						int              wb,
  5990 						pixman_fixed_t   vx,
  5991 						pixman_fixed_t   unit_x,
  5992 						pixman_fixed_t   max_vx,
  5993 						pixman_bool_t    zero_src)
  5995     BILINEAR_DECLARE_VARIABLES;
  5996     uint32_t pix1, pix2, pix3, pix4;
  5997     __m128i xmm_mask;
  5999     if (zero_src || (*mask >> 24) == 0)
  6000 	return;
  6002     xmm_mask = create_mask_16_128 (*mask >> 24);
  6004     while (w && ((uintptr_t)dst & 15))
  6006 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  6007 	if (pix1)
  6009 		uint32_t d = *dst;
  6011 		__m128i ms = unpack_32_1x128 (pix1);
  6012 		__m128i alpha     = expand_alpha_1x128 (ms);
  6013 		__m128i dest      = xmm_mask;
  6014 		__m128i alpha_dst = unpack_32_1x128 (d);
  6016 		*dst = pack_1x128_32
  6017 			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
  6020 	dst++;
  6021 	w--;
  6024     while (w >= 4)
  6026 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  6027 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
  6028 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
  6029 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
  6031 	if (pix1 | pix2 | pix3 | pix4)
  6033 	    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  6034 	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  6035 	    __m128i xmm_alpha_lo, xmm_alpha_hi;
  6037 	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
  6039 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  6041 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  6042 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  6043 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  6044 				&xmm_alpha_lo, &xmm_alpha_hi);
  6046 	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
  6047 			   &xmm_alpha_lo, &xmm_alpha_hi,
  6048 			   &xmm_mask, &xmm_mask,
  6049 			   &xmm_dst_lo, &xmm_dst_hi);
  6051 	    save_128_aligned
  6052 		((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  6055 	dst += 4;
  6056 	w -= 4;
  6059     while (w)
  6061 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  6062 	if (pix1)
  6064 		uint32_t d = *dst;
  6066 		__m128i ms = unpack_32_1x128 (pix1);
  6067 		__m128i alpha     = expand_alpha_1x128 (ms);
  6068 		__m128i dest      = xmm_mask;
  6069 		__m128i alpha_dst = unpack_32_1x128 (d);
  6071 		*dst = pack_1x128_32
  6072 			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
  6075 	dst++;
  6076 	w--;
  6080 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
  6081 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
  6082 			       uint32_t, uint32_t, uint32_t,
  6083 			       COVER, FLAG_HAVE_SOLID_MASK)
  6084 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
  6085 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
  6086 			       uint32_t, uint32_t, uint32_t,
  6087 			       PAD, FLAG_HAVE_SOLID_MASK)
  6088 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
  6089 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
  6090 			       uint32_t, uint32_t, uint32_t,
  6091 			       NONE, FLAG_HAVE_SOLID_MASK)
  6092 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
  6093 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
  6094 			       uint32_t, uint32_t, uint32_t,
  6095 			       NORMAL, FLAG_HAVE_SOLID_MASK)
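/*
 * Fast path table: maps (operator, source format, mask format,
 * destination format) combinations to the specialized composite
 * routines above.
 */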
  6097 static const pixman_fast_path_t sse2_fast_paths[] =
  6099     /* PIXMAN_OP_OVER */
  6100     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
  6101     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
  6102     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
  6103     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
  6104     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
  6105     PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
  6106     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
  6107     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
  6108     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
  6109     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
  6110     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
  6111     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
  6112     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
  6113     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
  6114     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
  6115     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
  6116     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
  6117     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
  6118     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
  6119     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
  6120     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
  6121     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
  6122     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
  6123     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
  6124     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
  6125     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
  6126     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
  6127     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
  6128     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
  6129     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
  6130     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
  6131     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
  6132     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
  6133     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
  6134     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
  6135     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
  6136     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
  6137     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
  6138     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
  6139     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
  6140     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
  6141     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
  6142     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
  6143     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
  6144     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
  6145     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
  6146     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
  6148     /* PIXMAN_OP_OVER_REVERSE */
  6149     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
  6150     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
  6152     /* PIXMAN_OP_ADD */
  6153     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
  6154     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
  6155     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
  6156     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
  6157     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
  6158     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
  6159     PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
  6160     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
  6161     PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
  6162     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
  6163     PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
  6164     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
  6165     PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
  6166     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
  6168     /* PIXMAN_OP_SRC */
  6169     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
  6170     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
  6171     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
  6172     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
  6173     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
  6174     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
  6175     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
  6176     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
  6177     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
  6178     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
  6179     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
  6180     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
  6181     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
  6182     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
  6183     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
  6184     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
  6185     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
  6186     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
  6188     /* PIXMAN_OP_IN */
  6189     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
  6190     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
  6191     PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
  6193     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
  6194     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
  6195     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
  6196     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  6197     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
  6198     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
  6199     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
  6200     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  6201     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
  6202     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
  6203     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
  6204     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  6205     SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
  6206     SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
  6207     SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
  6208     SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  6210     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
  6211     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
  6212     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
  6213     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
  6214     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
  6215     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
  6216     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
  6217     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
  6219     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
  6220     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
  6221     SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
  6222     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  6223     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
  6224     SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
  6226     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
  6227     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
  6228     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
  6229     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  6231     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
  6232     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
  6233     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
  6234     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
  6236     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
  6237     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
  6238     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
  6239     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
  6241     /* and here the needed entries are added to the fast path table */
  6243     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, sse2_8888_0565),
  6244     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, b5g6r5, sse2_8888_0565),
  6246     { PIXMAN_OP_NONE },
  6247 };
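/* Scanline fetchers for the general compositing path: each converts one
 * scanline of its source format to 8888 in iter->buffer, handling single
 * pixels until the destination is 16-byte aligned and then several pixels
 * per iteration with SSE2. */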
  6249 static uint32_t *
  6250 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
  6251 {
  6252     int w = iter->width;
  6253     __m128i ff000000 = mask_ff000000;
  6254     uint32_t *dst = iter->buffer;
  6255     uint32_t *src = (uint32_t *)iter->bits;
  6257     iter->bits += iter->stride;
  6259     while (w && ((uintptr_t)dst) & 0x0f)
  6260     {
  6261 	*dst++ = (*src++) | 0xff000000;
  6262 	w--;
  6263     }
  6265     while (w >= 4)
  6266     {
  6267 	save_128_aligned (
  6268 	    (__m128i *)dst, _mm_or_si128 (
  6269 		load_128_unaligned ((__m128i *)src), ff000000));
  6271 	dst += 4;
  6272 	src += 4;
  6273 	w -= 4;
  6274     }
  6276     while (w)
  6277     {
  6278 	*dst++ = (*src++) | 0xff000000;
  6279 	w--;
  6280     }
  6282     return iter->buffer;
  6283 }
  6285 static uint32_t *
  6286 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
  6287 {
  6288     int w = iter->width;
  6289     uint32_t *dst = iter->buffer;
  6290     uint16_t *src = (uint16_t *)iter->bits;
  6291     __m128i ff000000 = mask_ff000000;
  6293     iter->bits += iter->stride;
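    /* Convert single r5g6b5 pixels until dst is 16-byte aligned, then expand
     * eight pixels at a time with unpack_565_to_8888 and force the alpha
     * byte to 0xff. */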
  6295     while (w && ((uintptr_t)dst) & 0x0f)
  6296     {
  6297 	uint16_t s = *src++;
  6299 	*dst++ = convert_0565_to_8888 (s);
  6300 	w--;
  6301     }
  6303     while (w >= 8)
  6304     {
  6305 	__m128i lo, hi, s;
  6307 	s = _mm_loadu_si128 ((__m128i *)src);
  6309 	lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
  6310 	hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
  6312 	save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
  6313 	save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
  6315 	dst += 8;
  6316 	src += 8;
  6317 	w -= 8;
  6318     }
  6320     while (w)
  6321     {
  6322 	uint16_t s = *src++;
  6324 	*dst++ = convert_0565_to_8888 (s);
  6325 	w--;
  6326     }
  6328     return iter->buffer;
  6329 }
  6331 static uint32_t *
  6332 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
  6333 {
  6334     int w = iter->width;
  6335     uint32_t *dst = iter->buffer;
  6336     uint8_t *src = iter->bits;
  6337     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
  6339     iter->bits += iter->stride;
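    /* The unpack ladder below zero-extends 16 a8 bytes into 16 pixels:
     * interleaving zeros in as the low bytes twice moves each alpha value
     * into the top byte of its 32-bit lane (a << 24), matching the scalar
     * head and tail loops. */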
  6341     while (w && (((uintptr_t)dst) & 15))
  6342     {
  6343         *dst++ = *(src++) << 24;
  6344         w--;
  6345     }
  6347     while (w >= 16)
  6348     {
  6349 	xmm0 = _mm_loadu_si128((__m128i *)src);
  6351 	xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
  6352 	xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
  6353 	xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
  6354 	xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
  6355 	xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
  6356 	xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
  6358 	_mm_store_si128(((__m128i *)(dst +  0)), xmm3);
  6359 	_mm_store_si128(((__m128i *)(dst +  4)), xmm4);
  6360 	_mm_store_si128(((__m128i *)(dst +  8)), xmm5);
  6361 	_mm_store_si128(((__m128i *)(dst + 12)), xmm6);
  6363 	dst += 16;
  6364 	src += 16;
  6365 	w -= 16;
  6366     }
  6368     while (w)
  6369     {
  6370 	*dst++ = *(src++) << 24;
  6371 	w--;
  6372     }
  6374     return iter->buffer;
  6375 }
  6377 typedef struct
  6378 {
  6379     pixman_format_code_t	format;
  6380     pixman_iter_get_scanline_t	get_scanline;
  6381 } fetcher_info_t;
  6383 static const fetcher_info_t fetchers[] =
  6384 {
  6385     { PIXMAN_x8r8g8b8,		sse2_fetch_x8r8g8b8 },
  6386     { PIXMAN_r5g6b5,		sse2_fetch_r5g6b5 },
  6387     { PIXMAN_a8,		sse2_fetch_a8 },
  6388     { PIXMAN_null }
  6389 };
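/* SSE2 fetchers are only installed for narrow (8 bpc) iterators over
 * untransformed bits images whose samples fully cover the clip; anything
 * else falls through to the delegate implementation. */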
  6391 static pixman_bool_t
  6392 sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
  6393 {
  6394     pixman_image_t *image = iter->image;
  6396 #define FLAGS								\
  6397     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
  6398      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
  6400     if ((iter->iter_flags & ITER_NARROW)			&&
  6401 	(iter->image_flags & FLAGS) == FLAGS)
  6402     {
  6403 	const fetcher_info_t *f;
  6405 	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
  6406 	{
  6407 	    if (image->common.extended_format_code == f->format)
  6408 	    {
  6409 		uint8_t *b = (uint8_t *)image->bits.bits;
  6410 		int s = image->bits.rowstride * 4;
  6412 		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
  6413 		iter->stride = s;
  6415 		iter->get_scanline = f->get_scanline;
  6416 		return TRUE;
  6417 	    }
  6418 	}
  6419     }
  6421     return FALSE;
  6422 }
  6424 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
  6425 __attribute__((__force_align_arg_pointer__))
  6426 #endif
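/* On 32-bit x86 the ABI only guarantees 4-byte stack alignment, so GCC is
 * asked to realign the stack on entry; spilled __m128i locals need 16-byte
 * alignment. */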
  6427 pixman_implementation_t *
  6428 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
  6429 {
  6430     pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
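    /* The new implementation delegates anything it cannot handle to fallback;
     * sse2_fast_paths registers the composite fast paths defined above. */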
  6432     /* SSE2 constants */
  6433     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
  6434     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
  6435     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
  6436     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
  6437     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
  6438     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
  6439     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
  6440     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
  6441     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
  6442     mask_0080 = create_mask_16_128 (0x0080);
  6443     mask_00ff = create_mask_16_128 (0x00ff);
  6444     mask_0101 = create_mask_16_128 (0x0101);
  6445     mask_ffff = create_mask_16_128 (0xffff);
  6446     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
  6447     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
  6448     mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
  6449     mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
  6451     /* Set up function pointers */
  6452     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
  6453     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
  6454     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
  6455     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
  6456     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
  6457     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
  6458     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
  6459     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
  6460     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
  6461     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
  6463     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
  6465     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
  6466     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
  6467     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
  6468     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
  6469     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
  6470     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
  6471     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
  6472     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
  6473     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
  6474     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
  6475     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
  6477     imp->blt = sse2_blt;
  6478     imp->fill = sse2_fill;
  6480     imp->src_iter_init = sse2_src_iter_init;
  6482     return imp;
  6483 }
