gfx/cairo/libpixman/src/pixman-sse2.c

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Thu, 22 Jan 2015 13:21:57 +0100
branch:      TOR_BUG_9701
changeset:   15:b8a032363ba2
permissions: -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2  * Copyright © 2008 Rodrigo Kumpera
     3  * Copyright © 2008 André Tupinambá
     4  *
     5  * Permission to use, copy, modify, distribute, and sell this software and its
     6  * documentation for any purpose is hereby granted without fee, provided that
     7  * the above copyright notice appear in all copies and that both that
     8  * copyright notice and this permission notice appear in supporting
     9  * documentation, and that the name of Red Hat not be used in advertising or
    10  * publicity pertaining to distribution of the software without specific,
    11  * written prior permission.  Red Hat makes no representations about the
    12  * suitability of this software for any purpose.  It is provided "as is"
    13  * without express or implied warranty.
    14  *
    15  * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
    16  * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
    17  * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
    18  * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
    19  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
    20  * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
    21  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
    22  * SOFTWARE.
    23  *
    24  * Author:  Rodrigo Kumpera (kumpera@gmail.com)
    25  *          André Tupinambá (andrelrt@gmail.com)
    26  *
    27  * Based on work by Owen Taylor and Søren Sandmann
    28  */
    29 #ifdef HAVE_CONFIG_H
    30 #include <config.h>
    31 #endif
    33 #include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
    34 #include <emmintrin.h> /* for SSE2 intrinsics */
    35 #include "pixman-private.h"
    36 #include "pixman-combine32.h"
    37 #include "pixman-inlines.h"
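        /* Constant vectors used by the SSE2 paths below.  They are filled in
         * once, when the SSE2 implementation is set up (outside the portion
         * shown here), and treated as read-only afterwards.
         */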
    39 static __m128i mask_0080;
    40 static __m128i mask_00ff;
    41 static __m128i mask_0101;
    42 static __m128i mask_ffff;
    43 static __m128i mask_ff000000;
    44 static __m128i mask_alpha;
    46 static __m128i mask_565_r;
    47 static __m128i mask_565_g1, mask_565_g2;
    48 static __m128i mask_565_b;
    49 static __m128i mask_red;
    50 static __m128i mask_green;
    51 static __m128i mask_blue;
    53 static __m128i mask_565_fix_rb;
    54 static __m128i mask_565_fix_g;
    56 static __m128i mask_565_rb;
    57 static __m128i mask_565_pack_multiplier;
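        /* The unpack_* helpers widen packed 8-bit channels to 16 bits per
         * channel (one pixel per 64-bit half) so intermediate products fit
         * without overflow; the pack_* helpers below reverse this with
         * unsigned saturation.
         */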
    59 static force_inline __m128i
    60 unpack_32_1x128 (uint32_t data)
    61 {
    62     return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
    63 }
    65 static force_inline void
    66 unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
    67 {
    68     *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    69     *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
    70 }
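        /* Convert r5g6b5 values held in 32-bit lanes to x8r8g8b8: shift each
         * field into its byte, then replicate the field's high bits into the
         * freshly opened low bits so 0x1f expands to 0xff and 0x3f to 0xff.
         */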
    72 static force_inline __m128i
    73 unpack_565_to_8888 (__m128i lo)
    74 {
    75     __m128i r, g, b, rb, t;
    77     r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    78     g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    79     b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
    81     rb = _mm_or_si128 (r, b);
    82     t  = _mm_and_si128 (rb, mask_565_fix_rb);
    83     t  = _mm_srli_epi32 (t, 5);
    84     rb = _mm_or_si128 (rb, t);
    86     t  = _mm_and_si128 (g, mask_565_fix_g);
    87     t  = _mm_srli_epi32 (t, 6);
    88     g  = _mm_or_si128 (g, t);
    90     return _mm_or_si128 (rb, g);
    91 }
    93 static force_inline void
    94 unpack_565_128_4x128 (__m128i  data,
    95                       __m128i* data0,
    96                       __m128i* data1,
    97                       __m128i* data2,
    98                       __m128i* data3)
    99 {
   100     __m128i lo, hi;
   102     lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
   103     hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
   105     lo = unpack_565_to_8888 (lo);
   106     hi = unpack_565_to_8888 (hi);
   108     unpack_128_2x128 (lo, data0, data1);
   109     unpack_128_2x128 (hi, data2, data3);
   110 }
   112 static force_inline uint16_t
   113 pack_565_32_16 (uint32_t pixel)
   114 {
   115     return (uint16_t) (((pixel >> 8) & 0xf800) |
   116 		       ((pixel >> 5) & 0x07e0) |
   117 		       ((pixel >> 3) & 0x001f));
   118 }
   120 static force_inline __m128i
   121 pack_2x128_128 (__m128i lo, __m128i hi)
   122 {
   123     return _mm_packus_epi16 (lo, hi);
   124 }
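        /* Pack eight x8r8g8b8 pixels (still one per 32-bit lane in lo/hi)
         * straight to r5g6b5.  The multiply-add against
         * mask_565_pack_multiplier effectively shifts red and blue into their
         * 565 positions in one step; green is masked in separately.
         */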
   126 static force_inline __m128i
   127 pack_565_2packedx128_128 (__m128i lo, __m128i hi)
   128 {
   129     __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
   130     __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
   132     __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
   133     __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
   135     __m128i g0 = _mm_and_si128 (lo, mask_green);
   136     __m128i g1 = _mm_and_si128 (hi, mask_green);
   138     t0 = _mm_or_si128 (t0, g0);
   139     t1 = _mm_or_si128 (t1, g1);
   141     /* Simulates _mm_packus_epi32 */
   142     t0 = _mm_slli_epi32 (t0, 16 - 5);
   143     t1 = _mm_slli_epi32 (t1, 16 - 5);
   144     t0 = _mm_srai_epi32 (t0, 16);
   145     t1 = _mm_srai_epi32 (t1, 16);
   146     return _mm_packs_epi32 (t0, t1);
   147 }
   149 static force_inline __m128i
   150 pack_565_2x128_128 (__m128i lo, __m128i hi)
   151 {
   152     __m128i data;
   153     __m128i r, g1, g2, b;
   155     data = pack_2x128_128 (lo, hi);
   157     r  = _mm_and_si128 (data, mask_565_r);
   158     g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
   159     g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
   160     b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
   162     return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
   163 }
   165 static force_inline __m128i
   166 pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
   167 {
   168     return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
   169 			     pack_565_2x128_128 (*xmm2, *xmm3));
   170 }
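        /* The three tests below use a byte mask of 0x8888 so only bytes
         * 3, 7, 11 and 15 -- the alpha bytes of four packed pixels -- are
         * examined: is_opaque means all four alphas are 0xff, is_transparent
         * means they are all zero, and is_zero means the whole vector is zero.
         */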
   172 static force_inline int
   173 is_opaque (__m128i x)
   174 {
   175     __m128i ffs = _mm_cmpeq_epi8 (x, x);
   177     return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
   178 }
   180 static force_inline int
   181 is_zero (__m128i x)
   182 {
   183     return _mm_movemask_epi8 (
   184 	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
   185 }
   187 static force_inline int
   188 is_transparent (__m128i x)
   189 {
   190     return (_mm_movemask_epi8 (
   191 		_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
   192 }
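        /* The expand_* helpers broadcast either a whole unpacked pixel into
         * both halves of the register, or just its alpha lane across all four
         * 16-bit lanes of each half, producing a per-channel multiplier.
         */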
   194 static force_inline __m128i
   195 expand_pixel_32_1x128 (uint32_t data)
   196 {
   197     return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
   198 }
   200 static force_inline __m128i
   201 expand_alpha_1x128 (__m128i data)
   202 {
   203     return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
   204 						     _MM_SHUFFLE (3, 3, 3, 3)),
   205 				_MM_SHUFFLE (3, 3, 3, 3));
   206 }
   208 static force_inline void
   209 expand_alpha_2x128 (__m128i  data_lo,
   210                     __m128i  data_hi,
   211                     __m128i* alpha_lo,
   212                     __m128i* alpha_hi)
   213 {
   214     __m128i lo, hi;
   216     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
   217     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
   219     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
   220     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
   221 }
   223 static force_inline void
   224 expand_alpha_rev_2x128 (__m128i  data_lo,
   225                         __m128i  data_hi,
   226                         __m128i* alpha_lo,
   227                         __m128i* alpha_hi)
   228 {
   229     __m128i lo, hi;
   231     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
   232     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
   233     *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
   234     *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
   235 }
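        /* Per-channel multiply of two unpacked operands: a*b/255 for each
         * channel, using the usual pixman rounding trick -- add 0x0080, then
         * take the high 16 bits of a multiply by 0x0101 -- instead of a real
         * division.
         */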
   237 static force_inline void
   238 pix_multiply_2x128 (__m128i* data_lo,
   239                     __m128i* data_hi,
   240                     __m128i* alpha_lo,
   241                     __m128i* alpha_hi,
   242                     __m128i* ret_lo,
   243                     __m128i* ret_hi)
   244 {
   245     __m128i lo, hi;
   247     lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
   248     hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
   249     lo = _mm_adds_epu16 (lo, mask_0080);
   250     hi = _mm_adds_epu16 (hi, mask_0080);
   251     *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
   252     *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
   253 }
   255 static force_inline void
   256 pix_add_multiply_2x128 (__m128i* src_lo,
   257                         __m128i* src_hi,
   258                         __m128i* alpha_dst_lo,
   259                         __m128i* alpha_dst_hi,
   260                         __m128i* dst_lo,
   261                         __m128i* dst_hi,
   262                         __m128i* alpha_src_lo,
   263                         __m128i* alpha_src_hi,
   264                         __m128i* ret_lo,
   265                         __m128i* ret_hi)
   266 {
   267     __m128i t1_lo, t1_hi;
   268     __m128i t2_lo, t2_hi;
   270     pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
   271     pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
   273     *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
   274     *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
   275 }
   277 static force_inline void
   278 negate_2x128 (__m128i  data_lo,
   279               __m128i  data_hi,
   280               __m128i* neg_lo,
   281               __m128i* neg_hi)
   282 {
   283     *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
   284     *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
   285 }
   287 static force_inline void
   288 invert_colors_2x128 (__m128i  data_lo,
   289                      __m128i  data_hi,
   290                      __m128i* inv_lo,
   291                      __m128i* inv_hi)
   292 {
   293     __m128i lo, hi;
   295     lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
   296     hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
   297     *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
   298     *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
   299 }
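        /* Unpacked OVER operator: dst = src + dst * (255 - alpha) / 255, with
         * a saturating add so results cannot wrap.
         */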
   301 static force_inline void
   302 over_2x128 (__m128i* src_lo,
   303             __m128i* src_hi,
   304             __m128i* alpha_lo,
   305             __m128i* alpha_hi,
   306             __m128i* dst_lo,
   307             __m128i* dst_hi)
   308 {
   309     __m128i t1, t2;
   311     negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
   313     pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
   315     *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
   316     *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
   317 }
   319 static force_inline void
   320 over_rev_non_pre_2x128 (__m128i  src_lo,
   321                         __m128i  src_hi,
   322                         __m128i* dst_lo,
   323                         __m128i* dst_hi)
   324 {
   325     __m128i lo, hi;
   326     __m128i alpha_lo, alpha_hi;
   328     expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
   330     lo = _mm_or_si128 (alpha_lo, mask_alpha);
   331     hi = _mm_or_si128 (alpha_hi, mask_alpha);
   333     invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
   335     pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
   337     over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
   338 }
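        /* in_over = (src IN mask) OVER dst: both the source and its alpha are
         * first multiplied by the mask, then composited over the destination.
         */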
   340 static force_inline void
   341 in_over_2x128 (__m128i* src_lo,
   342                __m128i* src_hi,
   343                __m128i* alpha_lo,
   344                __m128i* alpha_hi,
   345                __m128i* mask_lo,
   346                __m128i* mask_hi,
   347                __m128i* dst_lo,
   348                __m128i* dst_hi)
   349 {
   350     __m128i s_lo, s_hi;
   351     __m128i a_lo, a_hi;
   353     pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
   354     pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
   356     over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
   357 }
   359 /* load 4 pixels from a 16-byte boundary aligned address */
   360 static force_inline __m128i
   361 load_128_aligned (__m128i* src)
   362 {
   363     return _mm_load_si128 (src);
   364 }
    366 /* load 4 pixels from an unaligned address */
   367 static force_inline __m128i
   368 load_128_unaligned (const __m128i* src)
   369 {
   370     return _mm_loadu_si128 (src);
   371 }
   373 /* save 4 pixels using Write Combining memory on a 16-byte
   374  * boundary aligned address
   375  */
   376 static force_inline void
   377 save_128_write_combining (__m128i* dst,
   378                           __m128i  data)
   379 {
   380     _mm_stream_si128 (dst, data);
   381 }
   383 /* save 4 pixels on a 16-byte boundary aligned address */
   384 static force_inline void
   385 save_128_aligned (__m128i* dst,
   386                   __m128i  data)
   387 {
   388     _mm_store_si128 (dst, data);
   389 }
    391 /* save 4 pixels to an unaligned address */
   392 static force_inline void
   393 save_128_unaligned (__m128i* dst,
   394                     __m128i  data)
   395 {
   396     _mm_storeu_si128 (dst, data);
   397 }
   399 static force_inline __m128i
   400 load_32_1x128 (uint32_t data)
   401 {
   402     return _mm_cvtsi32_si128 (data);
   403 }
   405 static force_inline __m128i
   406 expand_alpha_rev_1x128 (__m128i data)
   407 {
   408     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
   409 }
   411 static force_inline __m128i
   412 expand_pixel_8_1x128 (uint8_t data)
   413 {
   414     return _mm_shufflelo_epi16 (
   415 	unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
   416 }
   418 static force_inline __m128i
   419 pix_multiply_1x128 (__m128i data,
   420 		    __m128i alpha)
   421 {
   422     return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
   423 					    mask_0080),
   424 			    mask_0101);
   425 }
   427 static force_inline __m128i
   428 pix_add_multiply_1x128 (__m128i* src,
   429 			__m128i* alpha_dst,
   430 			__m128i* dst,
   431 			__m128i* alpha_src)
   432 {
   433     __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
   434     __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
   436     return _mm_adds_epu8 (t1, t2);
   437 }
   439 static force_inline __m128i
   440 negate_1x128 (__m128i data)
   441 {
   442     return _mm_xor_si128 (data, mask_00ff);
   443 }
   445 static force_inline __m128i
   446 invert_colors_1x128 (__m128i data)
   447 {
   448     return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
   449 }
   451 static force_inline __m128i
   452 over_1x128 (__m128i src, __m128i alpha, __m128i dst)
   453 {
   454     return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
   455 }
   457 static force_inline __m128i
   458 in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
   459 {
   460     return over_1x128 (pix_multiply_1x128 (*src, *mask),
   461 		       pix_multiply_1x128 (*alpha, *mask),
   462 		       *dst);
   463 }
   465 static force_inline __m128i
   466 over_rev_non_pre_1x128 (__m128i src, __m128i dst)
   467 {
   468     __m128i alpha = expand_alpha_1x128 (src);
   470     return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
   471 					   _mm_or_si128 (alpha, mask_alpha)),
   472 		       alpha,
   473 		       dst);
   474 }
   476 static force_inline uint32_t
   477 pack_1x128_32 (__m128i data)
   478 {
   479     return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
   480 }
   482 static force_inline __m128i
   483 expand565_16_1x128 (uint16_t pixel)
   484 {
   485     __m128i m = _mm_cvtsi32_si128 (pixel);
   487     m = unpack_565_to_8888 (m);
   489     return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
   490 }
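        /* Single-pixel OVER with the usual shortcuts: an opaque source
         * replaces the destination, a zero source leaves it untouched, and
         * anything else goes through the unpacked over_1x128 path.
         */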
   492 static force_inline uint32_t
   493 core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
   494 {
   495     uint8_t a;
   496     __m128i xmms;
   498     a = src >> 24;
   500     if (a == 0xff)
   501     {
   502 	return src;
   503     }
   504     else if (src)
   505     {
   506 	xmms = unpack_32_1x128 (src);
   507 	return pack_1x128_32 (
   508 	    over_1x128 (xmms, expand_alpha_1x128 (xmms),
   509 			unpack_32_1x128 (dst)));
   510     }
   512     return dst;
   513 }
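        /* combine1/combine4 fetch one or four source pixels and, when a mask
         * is present, multiply them by the mask's expanded alpha; combine4
         * also short-circuits to zero when all four mask pixels are
         * transparent.
         */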
   515 static force_inline uint32_t
   516 combine1 (const uint32_t *ps, const uint32_t *pm)
   517 {
   518     uint32_t s = *ps;
   520     if (pm)
   521     {
   522 	__m128i ms, mm;
   524 	mm = unpack_32_1x128 (*pm);
   525 	mm = expand_alpha_1x128 (mm);
   527 	ms = unpack_32_1x128 (s);
   528 	ms = pix_multiply_1x128 (ms, mm);
   530 	s = pack_1x128_32 (ms);
   531     }
   533     return s;
   534 }
   536 static force_inline __m128i
   537 combine4 (const __m128i *ps, const __m128i *pm)
   538 {
   539     __m128i xmm_src_lo, xmm_src_hi;
   540     __m128i xmm_msk_lo, xmm_msk_hi;
   541     __m128i s;
   543     if (pm)
   544     {
   545 	xmm_msk_lo = load_128_unaligned (pm);
   547 	if (is_transparent (xmm_msk_lo))
   548 	    return _mm_setzero_si128 ();
   549     }
   551     s = load_128_unaligned (ps);
   553     if (pm)
   554     {
   555 	unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
   556 	unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
   558 	expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
   560 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
   561 			    &xmm_msk_lo, &xmm_msk_hi,
   562 			    &xmm_src_lo, &xmm_src_hi);
   564 	s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
   565     }
   567     return s;
   568 }
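        /* The OVER workers below share one structure: a scalar loop until dst
         * is 16-byte aligned, a 4-pixels-at-a-time loop with aligned stores
         * (skipping the blend for all-zero or all-opaque source blocks), and
         * finally a scalar tail.
         */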
   570 static force_inline void
   571 core_combine_over_u_sse2_mask (uint32_t *	  pd,
   572 			       const uint32_t*    ps,
   573 			       const uint32_t*    pm,
   574 			       int                w)
   575 {
   576     uint32_t s, d;
   578     /* Align dst on a 16-byte boundary */
   579     while (w && ((uintptr_t)pd & 15))
   580     {
   581 	d = *pd;
   582 	s = combine1 (ps, pm);
   584 	if (s)
   585 	    *pd = core_combine_over_u_pixel_sse2 (s, d);
   586 	pd++;
   587 	ps++;
   588 	pm++;
   589 	w--;
   590     }
   592     while (w >= 4)
   593     {
   594 	__m128i mask = load_128_unaligned ((__m128i *)pm);
   596 	if (!is_zero (mask))
   597 	{
   598 	    __m128i src;
   599 	    __m128i src_hi, src_lo;
   600 	    __m128i mask_hi, mask_lo;
   601 	    __m128i alpha_hi, alpha_lo;
   603 	    src = load_128_unaligned ((__m128i *)ps);
   605 	    if (is_opaque (_mm_and_si128 (src, mask)))
   606 	    {
   607 		save_128_aligned ((__m128i *)pd, src);
   608 	    }
   609 	    else
   610 	    {
   611 		__m128i dst = load_128_aligned ((__m128i *)pd);
   612 		__m128i dst_hi, dst_lo;
   614 		unpack_128_2x128 (mask, &mask_lo, &mask_hi);
   615 		unpack_128_2x128 (src, &src_lo, &src_hi);
   617 		expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
   618 		pix_multiply_2x128 (&src_lo, &src_hi,
   619 				    &mask_lo, &mask_hi,
   620 				    &src_lo, &src_hi);
   622 		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
   624 		expand_alpha_2x128 (src_lo, src_hi,
   625 				    &alpha_lo, &alpha_hi);
   627 		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
   628 			    &dst_lo, &dst_hi);
   630 		save_128_aligned (
   631 		    (__m128i *)pd,
   632 		    pack_2x128_128 (dst_lo, dst_hi));
   633 	    }
   634 	}
   636 	pm += 4;
   637 	ps += 4;
   638 	pd += 4;
   639 	w -= 4;
   640     }
   641     while (w)
   642     {
   643 	d = *pd;
   644 	s = combine1 (ps, pm);
   646 	if (s)
   647 	    *pd = core_combine_over_u_pixel_sse2 (s, d);
   648 	pd++;
   649 	ps++;
   650 	pm++;
   652 	w--;
   653     }
   654 }
   656 static force_inline void
   657 core_combine_over_u_sse2_no_mask (uint32_t *	  pd,
   658 				  const uint32_t*    ps,
   659 				  int                w)
   660 {
   661     uint32_t s, d;
   663     /* Align dst on a 16-byte boundary */
   664     while (w && ((uintptr_t)pd & 15))
   665     {
   666 	d = *pd;
   667 	s = *ps;
   669 	if (s)
   670 	    *pd = core_combine_over_u_pixel_sse2 (s, d);
   671 	pd++;
   672 	ps++;
   673 	w--;
   674     }
   676     while (w >= 4)
   677     {
   678 	__m128i src;
   679 	__m128i src_hi, src_lo, dst_hi, dst_lo;
   680 	__m128i alpha_hi, alpha_lo;
   682 	src = load_128_unaligned ((__m128i *)ps);
   684 	if (!is_zero (src))
   685 	{
   686 	    if (is_opaque (src))
   687 	    {
   688 		save_128_aligned ((__m128i *)pd, src);
   689 	    }
   690 	    else
   691 	    {
   692 		__m128i dst = load_128_aligned ((__m128i *)pd);
   694 		unpack_128_2x128 (src, &src_lo, &src_hi);
   695 		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
   697 		expand_alpha_2x128 (src_lo, src_hi,
   698 				    &alpha_lo, &alpha_hi);
   699 		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
   700 			    &dst_lo, &dst_hi);
   702 		save_128_aligned (
   703 		    (__m128i *)pd,
   704 		    pack_2x128_128 (dst_lo, dst_hi));
   705 	    }
   706 	}
   708 	ps += 4;
   709 	pd += 4;
   710 	w -= 4;
   711     }
   712     while (w)
   713     {
   714 	d = *pd;
   715 	s = *ps;
   717 	if (s)
   718 	    *pd = core_combine_over_u_pixel_sse2 (s, d);
   719 	pd++;
   720 	ps++;
   722 	w--;
   723     }
   724 }
   726 static force_inline void
   727 sse2_combine_over_u (pixman_implementation_t *imp,
   728                      pixman_op_t              op,
   729                      uint32_t *               pd,
   730                      const uint32_t *         ps,
   731                      const uint32_t *         pm,
   732                      int                      w)
   733 {
   734     if (pm)
   735 	core_combine_over_u_sse2_mask (pd, ps, pm, w);
   736     else
   737 	core_combine_over_u_sse2_no_mask (pd, ps, w);
   738 }
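        /* OVER_REVERSE simply swaps the operands: the existing destination is
         * composited over the incoming source.
         */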
   740 static void
   741 sse2_combine_over_reverse_u (pixman_implementation_t *imp,
   742                              pixman_op_t              op,
   743                              uint32_t *               pd,
   744                              const uint32_t *         ps,
   745                              const uint32_t *         pm,
   746                              int                      w)
   747 {
   748     uint32_t s, d;
   750     __m128i xmm_dst_lo, xmm_dst_hi;
   751     __m128i xmm_src_lo, xmm_src_hi;
   752     __m128i xmm_alpha_lo, xmm_alpha_hi;
   754     /* Align dst on a 16-byte boundary */
   755     while (w &&
   756            ((uintptr_t)pd & 15))
   757     {
   758 	d = *pd;
   759 	s = combine1 (ps, pm);
   761 	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
   762 	w--;
   763 	ps++;
   764 	if (pm)
   765 	    pm++;
   766     }
   768     while (w >= 4)
   769     {
   770 	/* I'm loading unaligned because I'm not sure
   771 	 * about the address alignment.
   772 	 */
   773 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
   774 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
   776 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   777 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   779 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
   780 			    &xmm_alpha_lo, &xmm_alpha_hi);
   782 	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
   783 		    &xmm_alpha_lo, &xmm_alpha_hi,
   784 		    &xmm_src_lo, &xmm_src_hi);
    786 	/* rebuild the 4 pixel data and save */
   787 	save_128_aligned ((__m128i*)pd,
   788 			  pack_2x128_128 (xmm_src_lo, xmm_src_hi));
   790 	w -= 4;
   791 	ps += 4;
   792 	pd += 4;
   794 	if (pm)
   795 	    pm += 4;
   796     }
   798     while (w)
   799     {
   800 	d = *pd;
   801 	s = combine1 (ps, pm);
   803 	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
   804 	ps++;
   805 	w--;
   806 	if (pm)
   807 	    pm++;
   808     }
   809 }
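        /* IN scales the source by the destination alpha (src * da);
         * IN_REVERSE, below, is the mirror image (dst * sa).  Note that the
         * pixel helper's first argument supplies the alpha and the second the
         * colour being scaled.
         */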
   811 static force_inline uint32_t
   812 core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
   813 {
   814     uint32_t maska = src >> 24;
   816     if (maska == 0)
   817     {
   818 	return 0;
   819     }
   820     else if (maska != 0xff)
   821     {
   822 	return pack_1x128_32 (
   823 	    pix_multiply_1x128 (unpack_32_1x128 (dst),
   824 				expand_alpha_1x128 (unpack_32_1x128 (src))));
   825     }
   827     return dst;
   828 }
   830 static void
   831 sse2_combine_in_u (pixman_implementation_t *imp,
   832                    pixman_op_t              op,
   833                    uint32_t *               pd,
   834                    const uint32_t *         ps,
   835                    const uint32_t *         pm,
   836                    int                      w)
   837 {
   838     uint32_t s, d;
   840     __m128i xmm_src_lo, xmm_src_hi;
   841     __m128i xmm_dst_lo, xmm_dst_hi;
   843     while (w && ((uintptr_t)pd & 15))
   844     {
   845 	s = combine1 (ps, pm);
   846 	d = *pd;
   848 	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
   849 	w--;
   850 	ps++;
   851 	if (pm)
   852 	    pm++;
   853     }
   855     while (w >= 4)
   856     {
   857 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
   858 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
   860 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   861 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   863 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   864 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
   865 			    &xmm_dst_lo, &xmm_dst_hi,
   866 			    &xmm_dst_lo, &xmm_dst_hi);
   868 	save_128_aligned ((__m128i*)pd,
   869 			  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   871 	ps += 4;
   872 	pd += 4;
   873 	w -= 4;
   874 	if (pm)
   875 	    pm += 4;
   876     }
   878     while (w)
   879     {
   880 	s = combine1 (ps, pm);
   881 	d = *pd;
   883 	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
   884 	w--;
   885 	ps++;
   886 	if (pm)
   887 	    pm++;
   888     }
   889 }
   891 static void
   892 sse2_combine_in_reverse_u (pixman_implementation_t *imp,
   893                            pixman_op_t              op,
   894                            uint32_t *               pd,
   895                            const uint32_t *         ps,
   896                            const uint32_t *         pm,
   897                            int                      w)
   898 {
   899     uint32_t s, d;
   901     __m128i xmm_src_lo, xmm_src_hi;
   902     __m128i xmm_dst_lo, xmm_dst_hi;
   904     while (w && ((uintptr_t)pd & 15))
   905     {
   906 	s = combine1 (ps, pm);
   907 	d = *pd;
   909 	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
   910 	ps++;
   911 	w--;
   912 	if (pm)
   913 	    pm++;
   914     }
   916     while (w >= 4)
   917     {
   918 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
   919 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
   921 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   922 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   924 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   925 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
   926 			    &xmm_src_lo, &xmm_src_hi,
   927 			    &xmm_dst_lo, &xmm_dst_hi);
   929 	save_128_aligned (
   930 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   932 	ps += 4;
   933 	pd += 4;
   934 	w -= 4;
   935 	if (pm)
   936 	    pm += 4;
   937     }
   939     while (w)
   940     {
   941 	s = combine1 (ps, pm);
   942 	d = *pd;
   944 	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
   945 	w--;
   946 	ps++;
   947 	if (pm)
   948 	    pm++;
   949     }
   950 }
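        /* OUT_REVERSE keeps the part of the destination not covered by the
         * source: dst * (255 - sa) / 255.  OUT, further below, is the mirror
         * image, src * (255 - da) / 255.
         */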
   952 static void
   953 sse2_combine_out_reverse_u (pixman_implementation_t *imp,
   954                             pixman_op_t              op,
   955                             uint32_t *               pd,
   956                             const uint32_t *         ps,
   957                             const uint32_t *         pm,
   958                             int                      w)
   959 {
   960     while (w && ((uintptr_t)pd & 15))
   961     {
   962 	uint32_t s = combine1 (ps, pm);
   963 	uint32_t d = *pd;
   965 	*pd++ = pack_1x128_32 (
   966 	    pix_multiply_1x128 (
   967 		unpack_32_1x128 (d), negate_1x128 (
   968 		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
   970 	if (pm)
   971 	    pm++;
   972 	ps++;
   973 	w--;
   974     }
   976     while (w >= 4)
   977     {
   978 	__m128i xmm_src_lo, xmm_src_hi;
   979 	__m128i xmm_dst_lo, xmm_dst_hi;
   981 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
   982 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
   984 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   985 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   987 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   988 	negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   990 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
   991 			    &xmm_src_lo, &xmm_src_hi,
   992 			    &xmm_dst_lo, &xmm_dst_hi);
   994 	save_128_aligned (
   995 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   997 	ps += 4;
   998 	pd += 4;
   999 	if (pm)
  1000 	    pm += 4;
  1002 	w -= 4;
  1005     while (w)
  1007 	uint32_t s = combine1 (ps, pm);
  1008 	uint32_t d = *pd;
  1010 	*pd++ = pack_1x128_32 (
  1011 	    pix_multiply_1x128 (
  1012 		unpack_32_1x128 (d), negate_1x128 (
  1013 		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
  1014 	ps++;
  1015 	if (pm)
  1016 	    pm++;
  1017 	w--;
  1021 static void
  1022 sse2_combine_out_u (pixman_implementation_t *imp,
  1023                     pixman_op_t              op,
  1024                     uint32_t *               pd,
  1025                     const uint32_t *         ps,
  1026                     const uint32_t *         pm,
  1027                     int                      w)
  1029     while (w && ((uintptr_t)pd & 15))
  1031 	uint32_t s = combine1 (ps, pm);
  1032 	uint32_t d = *pd;
  1034 	*pd++ = pack_1x128_32 (
  1035 	    pix_multiply_1x128 (
  1036 		unpack_32_1x128 (s), negate_1x128 (
  1037 		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
  1038 	w--;
  1039 	ps++;
  1040 	if (pm)
  1041 	    pm++;
  1044     while (w >= 4)
  1046 	__m128i xmm_src_lo, xmm_src_hi;
  1047 	__m128i xmm_dst_lo, xmm_dst_hi;
  1049 	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
  1050 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
  1052 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1053 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1055 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1056 	negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1058 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1059 			    &xmm_dst_lo, &xmm_dst_hi,
  1060 			    &xmm_dst_lo, &xmm_dst_hi);
  1062 	save_128_aligned (
  1063 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1065 	ps += 4;
  1066 	pd += 4;
  1067 	w -= 4;
  1068 	if (pm)
  1069 	    pm += 4;
  1072     while (w)
  1074 	uint32_t s = combine1 (ps, pm);
  1075 	uint32_t d = *pd;
  1077 	*pd++ = pack_1x128_32 (
  1078 	    pix_multiply_1x128 (
  1079 		unpack_32_1x128 (s), negate_1x128 (
  1080 		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
  1081 	w--;
  1082 	ps++;
  1083 	if (pm)
  1084 	    pm++;
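        /* ATOP: the source weighted by the destination alpha plus the
         * destination weighted by the inverse source alpha, evaluated with a
         * single pix_add_multiply; the reverse variant swaps which alpha is
         * negated.
         */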
  1088 static force_inline uint32_t
  1089 core_combine_atop_u_pixel_sse2 (uint32_t src,
  1090                                 uint32_t dst)
  1092     __m128i s = unpack_32_1x128 (src);
  1093     __m128i d = unpack_32_1x128 (dst);
  1095     __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
  1096     __m128i da = expand_alpha_1x128 (d);
  1098     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
  1101 static void
  1102 sse2_combine_atop_u (pixman_implementation_t *imp,
  1103                      pixman_op_t              op,
  1104                      uint32_t *               pd,
  1105                      const uint32_t *         ps,
  1106                      const uint32_t *         pm,
  1107                      int                      w)
  1109     uint32_t s, d;
  1111     __m128i xmm_src_lo, xmm_src_hi;
  1112     __m128i xmm_dst_lo, xmm_dst_hi;
  1113     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
  1114     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
  1116     while (w && ((uintptr_t)pd & 15))
  1118 	s = combine1 (ps, pm);
  1119 	d = *pd;
  1121 	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
  1122 	w--;
  1123 	ps++;
  1124 	if (pm)
  1125 	    pm++;
  1128     while (w >= 4)
  1130 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
  1131 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
  1133 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1134 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1136 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1137 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  1138 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1139 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1141 	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
  1142 		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  1144 	pix_add_multiply_2x128 (
  1145 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
  1146 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
  1147 	    &xmm_dst_lo, &xmm_dst_hi);
  1149 	save_128_aligned (
  1150 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1152 	ps += 4;
  1153 	pd += 4;
  1154 	w -= 4;
  1155 	if (pm)
  1156 	    pm += 4;
  1159     while (w)
  1161 	s = combine1 (ps, pm);
  1162 	d = *pd;
  1164 	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
  1165 	w--;
  1166 	ps++;
  1167 	if (pm)
  1168 	    pm++;
  1172 static force_inline uint32_t
  1173 core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
  1174                                         uint32_t dst)
  1176     __m128i s = unpack_32_1x128 (src);
  1177     __m128i d = unpack_32_1x128 (dst);
  1179     __m128i sa = expand_alpha_1x128 (s);
  1180     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
  1182     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
  1185 static void
  1186 sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
  1187                              pixman_op_t              op,
  1188                              uint32_t *               pd,
  1189                              const uint32_t *         ps,
  1190                              const uint32_t *         pm,
  1191                              int                      w)
  1193     uint32_t s, d;
  1195     __m128i xmm_src_lo, xmm_src_hi;
  1196     __m128i xmm_dst_lo, xmm_dst_hi;
  1197     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
  1198     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
  1200     while (w && ((uintptr_t)pd & 15))
  1202 	s = combine1 (ps, pm);
  1203 	d = *pd;
  1205 	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
  1206 	ps++;
  1207 	w--;
  1208 	if (pm)
  1209 	    pm++;
  1212     while (w >= 4)
  1214 	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
  1215 	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
  1217 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1218 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1220 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1221 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  1222 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1223 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1225 	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
  1226 		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1228 	pix_add_multiply_2x128 (
  1229 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
  1230 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
  1231 	    &xmm_dst_lo, &xmm_dst_hi);
  1233 	save_128_aligned (
  1234 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1236 	ps += 4;
  1237 	pd += 4;
  1238 	w -= 4;
  1239 	if (pm)
  1240 	    pm += 4;
  1243     while (w)
  1245 	s = combine1 (ps, pm);
  1246 	d = *pd;
  1248 	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
  1249 	ps++;
  1250 	w--;
  1251 	if (pm)
  1252 	    pm++;
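        /* XOR: each operand is weighted by the complement of the other's
         * alpha, i.e. src * (1 - da) + dst * (1 - sa).
         */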
  1256 static force_inline uint32_t
  1257 core_combine_xor_u_pixel_sse2 (uint32_t src,
  1258                                uint32_t dst)
  1260     __m128i s = unpack_32_1x128 (src);
  1261     __m128i d = unpack_32_1x128 (dst);
  1263     __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
  1264     __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
  1266     return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
  1269 static void
  1270 sse2_combine_xor_u (pixman_implementation_t *imp,
  1271                     pixman_op_t              op,
  1272                     uint32_t *               dst,
  1273                     const uint32_t *         src,
  1274                     const uint32_t *         mask,
  1275                     int                      width)
  1277     int w = width;
  1278     uint32_t s, d;
  1279     uint32_t* pd = dst;
  1280     const uint32_t* ps = src;
  1281     const uint32_t* pm = mask;
  1283     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  1284     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1285     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
  1286     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
  1288     while (w && ((uintptr_t)pd & 15))
  1290 	s = combine1 (ps, pm);
  1291 	d = *pd;
  1293 	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
  1294 	w--;
  1295 	ps++;
  1296 	if (pm)
  1297 	    pm++;
  1300     while (w >= 4)
  1302 	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
  1303 	xmm_dst = load_128_aligned ((__m128i*) pd);
  1305 	unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  1306 	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1308 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1309 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  1310 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1311 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1313 	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
  1314 		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  1315 	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
  1316 		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1318 	pix_add_multiply_2x128 (
  1319 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
  1320 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
  1321 	    &xmm_dst_lo, &xmm_dst_hi);
  1323 	save_128_aligned (
  1324 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1326 	ps += 4;
  1327 	pd += 4;
  1328 	w -= 4;
  1329 	if (pm)
  1330 	    pm += 4;
  1333     while (w)
  1335 	s = combine1 (ps, pm);
  1336 	d = *pd;
  1338 	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
  1339 	w--;
  1340 	ps++;
  1341 	if (pm)
  1342 	    pm++;
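        /* ADD is a plain saturating byte-wise sum, so no unpacking is needed
         * and a whole 4-pixel block is handled by a single _mm_adds_epu8.
         */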
  1346 static force_inline void
  1347 sse2_combine_add_u (pixman_implementation_t *imp,
  1348                     pixman_op_t              op,
  1349                     uint32_t *               dst,
  1350                     const uint32_t *         src,
  1351                     const uint32_t *         mask,
  1352                     int                      width)
  1354     int w = width;
  1355     uint32_t s, d;
  1356     uint32_t* pd = dst;
  1357     const uint32_t* ps = src;
  1358     const uint32_t* pm = mask;
  1360     while (w && (uintptr_t)pd & 15)
  1362 	s = combine1 (ps, pm);
  1363 	d = *pd;
  1365 	ps++;
  1366 	if (pm)
  1367 	    pm++;
  1368 	*pd++ = _mm_cvtsi128_si32 (
  1369 	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
  1370 	w--;
  1373     while (w >= 4)
  1375 	__m128i s;
  1377 	s = combine4 ((__m128i*)ps, (__m128i*)pm);
  1379 	save_128_aligned (
  1380 	    (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
  1382 	pd += 4;
  1383 	ps += 4;
  1384 	if (pm)
  1385 	    pm += 4;
  1386 	w -= 4;
  1389     while (w--)
  1391 	s = combine1 (ps, pm);
  1392 	d = *pd;
  1394 	ps++;
  1395 	*pd++ = _mm_cvtsi128_si32 (
  1396 	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
  1397 	if (pm)
  1398 	    pm++;
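        /* SATURATE: when the source alpha exceeds the free space left in the
         * destination (~da), the source is first scaled down by da/sa
         * (DIV_UN8) before the saturating add.
         */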
  1402 static force_inline uint32_t
  1403 core_combine_saturate_u_pixel_sse2 (uint32_t src,
  1404                                     uint32_t dst)
  1406     __m128i ms = unpack_32_1x128 (src);
  1407     __m128i md = unpack_32_1x128 (dst);
  1408     uint32_t sa = src >> 24;
  1409     uint32_t da = ~dst >> 24;
  1411     if (sa > da)
  1413 	ms = pix_multiply_1x128 (
  1414 	    ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
  1417     return pack_1x128_32 (_mm_adds_epu16 (md, ms));
  1420 static void
  1421 sse2_combine_saturate_u (pixman_implementation_t *imp,
  1422                          pixman_op_t              op,
  1423                          uint32_t *               pd,
  1424                          const uint32_t *         ps,
  1425                          const uint32_t *         pm,
  1426                          int                      w)
  1428     uint32_t s, d;
  1430     uint32_t pack_cmp;
  1431     __m128i xmm_src, xmm_dst;
  1433     while (w && (uintptr_t)pd & 15)
  1435 	s = combine1 (ps, pm);
  1436 	d = *pd;
  1438 	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
  1439 	w--;
  1440 	ps++;
  1441 	if (pm)
  1442 	    pm++;
  1445     while (w >= 4)
  1447 	xmm_dst = load_128_aligned  ((__m128i*)pd);
  1448 	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
  1450 	pack_cmp = _mm_movemask_epi8 (
  1451 	    _mm_cmpgt_epi32 (
  1452 		_mm_srli_epi32 (xmm_src, 24),
  1453 		_mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
   1455 	/* if some src alpha is greater than the respective ~dst alpha */
  1456 	if (pack_cmp)
  1458 	    s = combine1 (ps++, pm);
  1459 	    d = *pd;
  1460 	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
  1461 	    if (pm)
  1462 		pm++;
  1464 	    s = combine1 (ps++, pm);
  1465 	    d = *pd;
  1466 	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
  1467 	    if (pm)
  1468 		pm++;
  1470 	    s = combine1 (ps++, pm);
  1471 	    d = *pd;
  1472 	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
  1473 	    if (pm)
  1474 		pm++;
  1476 	    s = combine1 (ps++, pm);
  1477 	    d = *pd;
  1478 	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
  1479 	    if (pm)
  1480 		pm++;
  1482 	else
  1484 	    save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
  1486 	    pd += 4;
  1487 	    ps += 4;
  1488 	    if (pm)
  1489 		pm += 4;
  1492 	w -= 4;
  1495     while (w--)
  1497 	s = combine1 (ps, pm);
  1498 	d = *pd;
  1500 	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
  1501 	ps++;
  1502 	if (pm)
  1503 	    pm++;
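        /* Component-alpha (_ca) combiners: the mask carries a separate 8-bit
         * factor per channel, so masking is a full per-channel multiply
         * rather than an alpha-only one.
         */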
  1507 static void
  1508 sse2_combine_src_ca (pixman_implementation_t *imp,
  1509                      pixman_op_t              op,
  1510                      uint32_t *               pd,
  1511                      const uint32_t *         ps,
  1512                      const uint32_t *         pm,
  1513                      int                      w)
  1515     uint32_t s, m;
  1517     __m128i xmm_src_lo, xmm_src_hi;
  1518     __m128i xmm_mask_lo, xmm_mask_hi;
  1519     __m128i xmm_dst_lo, xmm_dst_hi;
  1521     while (w && (uintptr_t)pd & 15)
  1523 	s = *ps++;
  1524 	m = *pm++;
  1525 	*pd++ = pack_1x128_32 (
  1526 	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
  1527 	w--;
  1530     while (w >= 4)
  1532 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1533 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1535 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1536 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1538 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1539 			    &xmm_mask_lo, &xmm_mask_hi,
  1540 			    &xmm_dst_lo, &xmm_dst_hi);
  1542 	save_128_aligned (
  1543 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1545 	ps += 4;
  1546 	pd += 4;
  1547 	pm += 4;
  1548 	w -= 4;
  1551     while (w)
  1553 	s = *ps++;
  1554 	m = *pm++;
  1555 	*pd++ = pack_1x128_32 (
  1556 	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
  1557 	w--;
  1561 static force_inline uint32_t
  1562 core_combine_over_ca_pixel_sse2 (uint32_t src,
  1563                                  uint32_t mask,
  1564                                  uint32_t dst)
  1566     __m128i s = unpack_32_1x128 (src);
  1567     __m128i expAlpha = expand_alpha_1x128 (s);
  1568     __m128i unpk_mask = unpack_32_1x128 (mask);
  1569     __m128i unpk_dst  = unpack_32_1x128 (dst);
  1571     return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
  1574 static void
  1575 sse2_combine_over_ca (pixman_implementation_t *imp,
  1576                       pixman_op_t              op,
  1577                       uint32_t *               pd,
  1578                       const uint32_t *         ps,
  1579                       const uint32_t *         pm,
  1580                       int                      w)
  1582     uint32_t s, m, d;
  1584     __m128i xmm_alpha_lo, xmm_alpha_hi;
  1585     __m128i xmm_src_lo, xmm_src_hi;
  1586     __m128i xmm_dst_lo, xmm_dst_hi;
  1587     __m128i xmm_mask_lo, xmm_mask_hi;
  1589     while (w && (uintptr_t)pd & 15)
  1591 	s = *ps++;
  1592 	m = *pm++;
  1593 	d = *pd;
  1595 	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
  1596 	w--;
  1599     while (w >= 4)
  1601 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1602 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1603 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1605 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1606 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1607 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1609 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1610 			    &xmm_alpha_lo, &xmm_alpha_hi);
  1612 	in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
  1613 		       &xmm_alpha_lo, &xmm_alpha_hi,
  1614 		       &xmm_mask_lo, &xmm_mask_hi,
  1615 		       &xmm_dst_lo, &xmm_dst_hi);
  1617 	save_128_aligned (
  1618 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1620 	ps += 4;
  1621 	pd += 4;
  1622 	pm += 4;
  1623 	w -= 4;
  1626     while (w)
  1628 	s = *ps++;
  1629 	m = *pm++;
  1630 	d = *pd;
  1632 	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
  1633 	w--;
  1637 static force_inline uint32_t
  1638 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
  1639                                          uint32_t mask,
  1640                                          uint32_t dst)
  1642     __m128i d = unpack_32_1x128 (dst);
  1644     return pack_1x128_32 (
  1645 	over_1x128 (d, expand_alpha_1x128 (d),
  1646 		    pix_multiply_1x128 (unpack_32_1x128 (src),
  1647 					unpack_32_1x128 (mask))));
  1650 static void
  1651 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
  1652                               pixman_op_t              op,
  1653                               uint32_t *               pd,
  1654                               const uint32_t *         ps,
  1655                               const uint32_t *         pm,
  1656                               int                      w)
  1658     uint32_t s, m, d;
  1660     __m128i xmm_alpha_lo, xmm_alpha_hi;
  1661     __m128i xmm_src_lo, xmm_src_hi;
  1662     __m128i xmm_dst_lo, xmm_dst_hi;
  1663     __m128i xmm_mask_lo, xmm_mask_hi;
  1665     while (w && (uintptr_t)pd & 15)
  1667 	s = *ps++;
  1668 	m = *pm++;
  1669 	d = *pd;
  1671 	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
  1672 	w--;
  1675     while (w >= 4)
  1677 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1678 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1679 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1681 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1682 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1683 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1685 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1686 			    &xmm_alpha_lo, &xmm_alpha_hi);
  1687 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1688 			    &xmm_mask_lo, &xmm_mask_hi,
  1689 			    &xmm_mask_lo, &xmm_mask_hi);
  1691 	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
  1692 		    &xmm_alpha_lo, &xmm_alpha_hi,
  1693 		    &xmm_mask_lo, &xmm_mask_hi);
  1695 	save_128_aligned (
  1696 	    (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
  1698 	ps += 4;
  1699 	pd += 4;
  1700 	pm += 4;
  1701 	w -= 4;
  1704     while (w)
  1706 	s = *ps++;
  1707 	m = *pm++;
  1708 	d = *pd;
  1710 	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
  1711 	w--;
  1715 static void
  1716 sse2_combine_in_ca (pixman_implementation_t *imp,
  1717                     pixman_op_t              op,
  1718                     uint32_t *               pd,
  1719                     const uint32_t *         ps,
  1720                     const uint32_t *         pm,
  1721                     int                      w)
  1723     uint32_t s, m, d;
  1725     __m128i xmm_alpha_lo, xmm_alpha_hi;
  1726     __m128i xmm_src_lo, xmm_src_hi;
  1727     __m128i xmm_dst_lo, xmm_dst_hi;
  1728     __m128i xmm_mask_lo, xmm_mask_hi;
  1730     while (w && (uintptr_t)pd & 15)
  1732 	s = *ps++;
  1733 	m = *pm++;
  1734 	d = *pd;
  1736 	*pd++ = pack_1x128_32 (
  1737 	    pix_multiply_1x128 (
  1738 		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
  1739 		expand_alpha_1x128 (unpack_32_1x128 (d))));
  1741 	w--;
  1744     while (w >= 4)
  1746 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1747 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1748 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1750 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1751 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1752 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1754 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1755 			    &xmm_alpha_lo, &xmm_alpha_hi);
  1757 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1758 			    &xmm_mask_lo, &xmm_mask_hi,
  1759 			    &xmm_dst_lo, &xmm_dst_hi);
  1761 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
  1762 			    &xmm_alpha_lo, &xmm_alpha_hi,
  1763 			    &xmm_dst_lo, &xmm_dst_hi);
  1765 	save_128_aligned (
  1766 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1768 	ps += 4;
  1769 	pd += 4;
  1770 	pm += 4;
  1771 	w -= 4;
  1774     while (w)
  1776 	s = *ps++;
  1777 	m = *pm++;
  1778 	d = *pd;
  1780 	*pd++ = pack_1x128_32 (
  1781 	    pix_multiply_1x128 (
  1782 		pix_multiply_1x128 (
  1783 		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
  1784 		expand_alpha_1x128 (unpack_32_1x128 (d))));
  1786 	w--;
  1790 static void
  1791 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
  1792                             pixman_op_t              op,
  1793                             uint32_t *               pd,
  1794                             const uint32_t *         ps,
  1795                             const uint32_t *         pm,
  1796                             int                      w)
  1798     uint32_t s, m, d;
  1800     __m128i xmm_alpha_lo, xmm_alpha_hi;
  1801     __m128i xmm_src_lo, xmm_src_hi;
  1802     __m128i xmm_dst_lo, xmm_dst_hi;
  1803     __m128i xmm_mask_lo, xmm_mask_hi;
  1805     while (w && (uintptr_t)pd & 15)
  1807 	s = *ps++;
  1808 	m = *pm++;
  1809 	d = *pd;
  1811 	*pd++ = pack_1x128_32 (
  1812 	    pix_multiply_1x128 (
  1813 		unpack_32_1x128 (d),
  1814 		pix_multiply_1x128 (unpack_32_1x128 (m),
  1815 				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
  1816 	w--;
  1819     while (w >= 4)
  1821 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1822 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1823 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1825 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1826 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1827 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1829 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1830 			    &xmm_alpha_lo, &xmm_alpha_hi);
  1831 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
  1832 			    &xmm_alpha_lo, &xmm_alpha_hi,
  1833 			    &xmm_alpha_lo, &xmm_alpha_hi);
  1835 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
  1836 			    &xmm_alpha_lo, &xmm_alpha_hi,
  1837 			    &xmm_dst_lo, &xmm_dst_hi);
  1839 	save_128_aligned (
  1840 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1842 	ps += 4;
  1843 	pd += 4;
  1844 	pm += 4;
  1845 	w -= 4;
  1848     while (w)
  1850 	s = *ps++;
  1851 	m = *pm++;
  1852 	d = *pd;
  1854 	*pd++ = pack_1x128_32 (
  1855 	    pix_multiply_1x128 (
  1856 		unpack_32_1x128 (d),
  1857 		pix_multiply_1x128 (unpack_32_1x128 (m),
  1858 				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
  1859 	w--;
  1863 static void
  1864 sse2_combine_out_ca (pixman_implementation_t *imp,
  1865                      pixman_op_t              op,
  1866                      uint32_t *               pd,
  1867                      const uint32_t *         ps,
  1868                      const uint32_t *         pm,
  1869                      int                      w)
  1871     uint32_t s, m, d;
  1873     __m128i xmm_alpha_lo, xmm_alpha_hi;
  1874     __m128i xmm_src_lo, xmm_src_hi;
  1875     __m128i xmm_dst_lo, xmm_dst_hi;
  1876     __m128i xmm_mask_lo, xmm_mask_hi;
  1878     while (w && (uintptr_t)pd & 15)
  1880 	s = *ps++;
  1881 	m = *pm++;
  1882 	d = *pd;
  1884 	*pd++ = pack_1x128_32 (
  1885 	    pix_multiply_1x128 (
  1886 		pix_multiply_1x128 (
  1887 		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
  1888 		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
  1889 	w--;
  1892     while (w >= 4)
  1894 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1895 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1896 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1898 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1899 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1900 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1902 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1903 			    &xmm_alpha_lo, &xmm_alpha_hi);
  1904 	negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
  1905 		      &xmm_alpha_lo, &xmm_alpha_hi);
  1907 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1908 			    &xmm_mask_lo, &xmm_mask_hi,
  1909 			    &xmm_dst_lo, &xmm_dst_hi);
  1910 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
  1911 			    &xmm_alpha_lo, &xmm_alpha_hi,
  1912 			    &xmm_dst_lo, &xmm_dst_hi);
  1914 	save_128_aligned (
  1915 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1917 	ps += 4;
  1918 	pd += 4;
  1919 	pm += 4;
  1920 	w -= 4;
  1923     while (w)
  1925 	s = *ps++;
  1926 	m = *pm++;
  1927 	d = *pd;
  1929 	*pd++ = pack_1x128_32 (
  1930 	    pix_multiply_1x128 (
  1931 		pix_multiply_1x128 (
  1932 		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
  1933 		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
  1935 	w--;
  1939 static void
  1940 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
  1941                              pixman_op_t              op,
  1942                              uint32_t *               pd,
  1943                              const uint32_t *         ps,
  1944                              const uint32_t *         pm,
  1945                              int                      w)
  1947     uint32_t s, m, d;
  1949     __m128i xmm_alpha_lo, xmm_alpha_hi;
  1950     __m128i xmm_src_lo, xmm_src_hi;
  1951     __m128i xmm_dst_lo, xmm_dst_hi;
  1952     __m128i xmm_mask_lo, xmm_mask_hi;
  1954     while (w && (uintptr_t)pd & 15)
  1956 	s = *ps++;
  1957 	m = *pm++;
  1958 	d = *pd;
  1960 	*pd++ = pack_1x128_32 (
  1961 	    pix_multiply_1x128 (
  1962 		unpack_32_1x128 (d),
  1963 		negate_1x128 (pix_multiply_1x128 (
  1964 				 unpack_32_1x128 (m),
  1965 				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
  1966 	w--;
  1969     while (w >= 4)
  1971 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1972 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1973 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1975 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1976 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1977 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1979 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1980 			    &xmm_alpha_lo, &xmm_alpha_hi);
  1982 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
  1983 			    &xmm_alpha_lo, &xmm_alpha_hi,
  1984 			    &xmm_mask_lo, &xmm_mask_hi);
  1986 	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
  1987 		      &xmm_mask_lo, &xmm_mask_hi);
  1989 	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
  1990 			    &xmm_mask_lo, &xmm_mask_hi,
  1991 			    &xmm_dst_lo, &xmm_dst_hi);
  1993 	save_128_aligned (
  1994 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1996 	ps += 4;
  1997 	pd += 4;
  1998 	pm += 4;
  1999 	w -= 4;
  2002     while (w)
  2004 	s = *ps++;
  2005 	m = *pm++;
  2006 	d = *pd;
  2008 	*pd++ = pack_1x128_32 (
  2009 	    pix_multiply_1x128 (
  2010 		unpack_32_1x128 (d),
  2011 		negate_1x128 (pix_multiply_1x128 (
  2012 				 unpack_32_1x128 (m),
  2013 				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
  2014 	w--;
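/* Component-alpha ATOP for a single pixel:
 * dest = (src * mask) * dest.alpha + dest * ~(mask * src.alpha).
 */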
  2018 static force_inline uint32_t
  2019 core_combine_atop_ca_pixel_sse2 (uint32_t src,
  2020                                  uint32_t mask,
  2021                                  uint32_t dst)
  2023     __m128i m = unpack_32_1x128 (mask);
  2024     __m128i s = unpack_32_1x128 (src);
  2025     __m128i d = unpack_32_1x128 (dst);
  2026     __m128i sa = expand_alpha_1x128 (s);
  2027     __m128i da = expand_alpha_1x128 (d);
  2029     s = pix_multiply_1x128 (s, m);
  2030     m = negate_1x128 (pix_multiply_1x128 (m, sa));
  2032     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
  2035 static void
  2036 sse2_combine_atop_ca (pixman_implementation_t *imp,
  2037                       pixman_op_t              op,
  2038                       uint32_t *               pd,
  2039                       const uint32_t *         ps,
  2040                       const uint32_t *         pm,
  2041                       int                      w)
  2043     uint32_t s, m, d;
  2045     __m128i xmm_src_lo, xmm_src_hi;
  2046     __m128i xmm_dst_lo, xmm_dst_hi;
  2047     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
  2048     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
  2049     __m128i xmm_mask_lo, xmm_mask_hi;
  2051     while (w && (uintptr_t)pd & 15)
  2053 	s = *ps++;
  2054 	m = *pm++;
  2055 	d = *pd;
  2057 	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
  2058 	w--;
  2061     while (w >= 4)
  2063 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  2064 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  2065 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  2067 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  2068 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  2069 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  2071 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  2072 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  2073 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  2074 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  2076 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  2077 			    &xmm_mask_lo, &xmm_mask_hi,
  2078 			    &xmm_src_lo, &xmm_src_hi);
  2079 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
  2080 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
  2081 			    &xmm_mask_lo, &xmm_mask_hi);
  2083 	negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  2085 	pix_add_multiply_2x128 (
  2086 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
  2087 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
  2088 	    &xmm_dst_lo, &xmm_dst_hi);
  2090 	save_128_aligned (
  2091 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  2093 	ps += 4;
  2094 	pd += 4;
  2095 	pm += 4;
  2096 	w -= 4;
  2099     while (w)
  2101 	s = *ps++;
  2102 	m = *pm++;
  2103 	d = *pd;
  2105 	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
  2106 	w--;
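/* Component-alpha ATOP_REVERSE for a single pixel:
 * dest = (src * mask) * ~dest.alpha + dest * (mask * src.alpha).
 */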
  2110 static force_inline uint32_t
  2111 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
  2112                                          uint32_t mask,
  2113                                          uint32_t dst)
  2115     __m128i m = unpack_32_1x128 (mask);
  2116     __m128i s = unpack_32_1x128 (src);
  2117     __m128i d = unpack_32_1x128 (dst);
  2119     __m128i da = negate_1x128 (expand_alpha_1x128 (d));
  2120     __m128i sa = expand_alpha_1x128 (s);
  2122     s = pix_multiply_1x128 (s, m);
  2123     m = pix_multiply_1x128 (m, sa);
  2125     return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
  2128 static void
  2129 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
  2130                               pixman_op_t              op,
  2131                               uint32_t *               pd,
  2132                               const uint32_t *         ps,
  2133                               const uint32_t *         pm,
  2134                               int                      w)
  2136     uint32_t s, m, d;
  2138     __m128i xmm_src_lo, xmm_src_hi;
  2139     __m128i xmm_dst_lo, xmm_dst_hi;
  2140     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
  2141     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
  2142     __m128i xmm_mask_lo, xmm_mask_hi;
  2144     while (w && (uintptr_t)pd & 15)
  2146 	s = *ps++;
  2147 	m = *pm++;
  2148 	d = *pd;
  2150 	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
  2151 	w--;
  2154     while (w >= 4)
  2156 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  2157 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  2158 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  2160 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  2161 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  2162 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  2164 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  2165 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  2166 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  2167 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  2169 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  2170 			    &xmm_mask_lo, &xmm_mask_hi,
  2171 			    &xmm_src_lo, &xmm_src_hi);
  2172 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
  2173 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
  2174 			    &xmm_mask_lo, &xmm_mask_hi);
  2176 	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
  2177 		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  2179 	pix_add_multiply_2x128 (
  2180 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
  2181 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
  2182 	    &xmm_dst_lo, &xmm_dst_hi);
  2184 	save_128_aligned (
  2185 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  2187 	ps += 4;
  2188 	pd += 4;
  2189 	pm += 4;
  2190 	w -= 4;
  2193     while (w)
  2195 	s = *ps++;
  2196 	m = *pm++;
  2197 	d = *pd;
  2199 	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
  2200 	w--;
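/* Component-alpha XOR for a single pixel:
 * dest = (src * mask) * ~dest.alpha + dest * ~(mask * src.alpha).
 */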
  2204 static force_inline uint32_t
  2205 core_combine_xor_ca_pixel_sse2 (uint32_t src,
  2206                                 uint32_t mask,
  2207                                 uint32_t dst)
  2209     __m128i a = unpack_32_1x128 (mask);
  2210     __m128i s = unpack_32_1x128 (src);
  2211     __m128i d = unpack_32_1x128 (dst);
  2213     __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
  2214 				       a, expand_alpha_1x128 (s)));
  2215     __m128i dest      = pix_multiply_1x128 (s, a);
  2216     __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
  2218     return pack_1x128_32 (pix_add_multiply_1x128 (&d,
  2219                                                 &alpha_dst,
  2220                                                 &dest,
  2221                                                 &alpha_src));
  2224 static void
  2225 sse2_combine_xor_ca (pixman_implementation_t *imp,
  2226                      pixman_op_t              op,
  2227                      uint32_t *               pd,
  2228                      const uint32_t *         ps,
  2229                      const uint32_t *         pm,
  2230                      int                      w)
  2232     uint32_t s, m, d;
  2234     __m128i xmm_src_lo, xmm_src_hi;
  2235     __m128i xmm_dst_lo, xmm_dst_hi;
  2236     __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
  2237     __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
  2238     __m128i xmm_mask_lo, xmm_mask_hi;
  2240     while (w && (uintptr_t)pd & 15)
  2242 	s = *ps++;
  2243 	m = *pm++;
  2244 	d = *pd;
  2246 	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
  2247 	w--;
  2250     while (w >= 4)
  2252 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  2253 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  2254 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  2256 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  2257 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  2258 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  2260 	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  2261 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  2262 	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  2263 			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  2265 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  2266 			    &xmm_mask_lo, &xmm_mask_hi,
  2267 			    &xmm_src_lo, &xmm_src_hi);
  2268 	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
  2269 			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
  2270 			    &xmm_mask_lo, &xmm_mask_hi);
  2272 	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
  2273 		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  2274 	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
  2275 		      &xmm_mask_lo, &xmm_mask_hi);
  2277 	pix_add_multiply_2x128 (
  2278 	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
  2279 	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
  2280 	    &xmm_dst_lo, &xmm_dst_hi);
  2282 	save_128_aligned (
  2283 	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  2285 	ps += 4;
  2286 	pd += 4;
  2287 	pm += 4;
  2288 	w -= 4;
  2291     while (w)
  2293 	s = *ps++;
  2294 	m = *pm++;
  2295 	d = *pd;
  2297 	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
  2298 	w--;
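/* Component-alpha ADD: dest = clamp (src * mask + dest), using
 * saturating unsigned byte addition for the sum.
 */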
  2302 static void
  2303 sse2_combine_add_ca (pixman_implementation_t *imp,
  2304                      pixman_op_t              op,
  2305                      uint32_t *               pd,
  2306                      const uint32_t *         ps,
  2307                      const uint32_t *         pm,
  2308                      int                      w)
  2310     uint32_t s, m, d;
  2312     __m128i xmm_src_lo, xmm_src_hi;
  2313     __m128i xmm_dst_lo, xmm_dst_hi;
  2314     __m128i xmm_mask_lo, xmm_mask_hi;
  2316     while (w && (uintptr_t)pd & 15)
  2318 	s = *ps++;
  2319 	m = *pm++;
  2320 	d = *pd;
  2322 	*pd++ = pack_1x128_32 (
  2323 	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
  2324 					       unpack_32_1x128 (m)),
  2325 			   unpack_32_1x128 (d)));
  2326 	w--;
  2329     while (w >= 4)
  2331 	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  2332 	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  2333 	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  2335 	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  2336 	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  2337 	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  2339 	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  2340 			    &xmm_mask_lo, &xmm_mask_hi,
  2341 			    &xmm_src_lo, &xmm_src_hi);
  2343 	save_128_aligned (
  2344 	    (__m128i*)pd, pack_2x128_128 (
  2345 		_mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
  2346 		_mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
  2348 	ps += 4;
  2349 	pd += 4;
  2350 	pm += 4;
  2351 	w -= 4;
  2354     while (w)
  2356 	s = *ps++;
  2357 	m = *pm++;
  2358 	d = *pd;
  2360 	*pd++ = pack_1x128_32 (
  2361 	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
  2362 					       unpack_32_1x128 (m)),
  2363 			   unpack_32_1x128 (d)));
  2364 	w--;
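/* Broadcast a 16-bit value across all eight lanes of an XMM register. */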
  2368 static force_inline __m128i
  2369 create_mask_16_128 (uint16_t mask)
  2371     return _mm_set1_epi16 (mask);
  2374 /* Work around a code generation bug in Sun Studio 12. */
  2375 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
  2376 # define create_mask_2x32_128(mask0, mask1)				\
  2377     (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
  2378 #else
  2379 static force_inline __m128i
  2380 create_mask_2x32_128 (uint32_t mask0,
  2381                       uint32_t mask1)
  2383     return _mm_set_epi32 (mask0, mask1, mask0, mask1);
  2385 #endif
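/* OVER a solid source onto an a8r8g8b8 destination, no mask. */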
  2387 static void
  2388 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
  2389                             pixman_composite_info_t *info)
  2391     PIXMAN_COMPOSITE_ARGS (info);
  2392     uint32_t src;
  2393     uint32_t    *dst_line, *dst, d;
  2394     int32_t w;
  2395     int dst_stride;
  2396     __m128i xmm_src, xmm_alpha;
  2397     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  2399     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  2401     if (src == 0)
  2402 	return;
  2404     PIXMAN_IMAGE_GET_LINE (
  2405 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  2407     xmm_src = expand_pixel_32_1x128 (src);
  2408     xmm_alpha = expand_alpha_1x128 (xmm_src);
  2410     while (height--)
  2412 	dst = dst_line;
  2414 	dst_line += dst_stride;
  2415 	w = width;
  2417 	while (w && (uintptr_t)dst & 15)
  2419 	    d = *dst;
  2420 	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
  2421 						xmm_alpha,
  2422 						unpack_32_1x128 (d)));
  2423 	    w--;
  2426 	while (w >= 4)
  2428 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  2430 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  2432 	    over_2x128 (&xmm_src, &xmm_src,
  2433 			&xmm_alpha, &xmm_alpha,
  2434 			&xmm_dst_lo, &xmm_dst_hi);
  2436 	    /* rebuild the 4 pixel data and save */
  2437 	    save_128_aligned (
  2438 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  2440 	    w -= 4;
  2441 	    dst += 4;
  2444 	while (w)
  2446 	    d = *dst;
  2447 	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
  2448 						xmm_alpha,
  2449 						unpack_32_1x128 (d)));
  2450 	    w--;
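/* OVER a solid source onto an r5g6b5 destination, no mask;
 * eight 16-bit pixels per SSE2 iteration.
 */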
  2456 static void
  2457 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
  2458                             pixman_composite_info_t *info)
  2460     PIXMAN_COMPOSITE_ARGS (info);
  2461     uint32_t src;
  2462     uint16_t    *dst_line, *dst, d;
  2463     int32_t w;
  2464     int dst_stride;
  2465     __m128i xmm_src, xmm_alpha;
  2466     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
  2468     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  2470     if (src == 0)
  2471 	return;
  2473     PIXMAN_IMAGE_GET_LINE (
  2474 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  2476     xmm_src = expand_pixel_32_1x128 (src);
  2477     xmm_alpha = expand_alpha_1x128 (xmm_src);
  2479     while (height--)
  2481 	dst = dst_line;
  2483 	dst_line += dst_stride;
  2484 	w = width;
  2486 	while (w && (uintptr_t)dst & 15)
  2488 	    d = *dst;
  2490 	    *dst++ = pack_565_32_16 (
  2491 		pack_1x128_32 (over_1x128 (xmm_src,
  2492 					   xmm_alpha,
  2493 					   expand565_16_1x128 (d))));
  2494 	    w--;
  2497 	while (w >= 8)
  2499 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  2501 	    unpack_565_128_4x128 (xmm_dst,
  2502 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
  2504 	    over_2x128 (&xmm_src, &xmm_src,
  2505 			&xmm_alpha, &xmm_alpha,
  2506 			&xmm_dst0, &xmm_dst1);
  2507 	    over_2x128 (&xmm_src, &xmm_src,
  2508 			&xmm_alpha, &xmm_alpha,
  2509 			&xmm_dst2, &xmm_dst3);
  2511 	    xmm_dst = pack_565_4x128_128 (
  2512 		&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
  2514 	    save_128_aligned ((__m128i*)dst, xmm_dst);
  2516 	    dst += 8;
  2517 	    w -= 8;
  2520 	while (w--)
  2522 	    d = *dst;
  2523 	    *dst++ = pack_565_32_16 (
  2524 		pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
  2525 					   expand565_16_1x128 (d))));
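/* ADD a solid source through an a8r8g8b8 component-alpha mask onto an
 * a8r8g8b8 destination; groups of four pixels whose mask is entirely
 * zero are skipped.
 */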
  2531 static void
  2532 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
  2533 				   pixman_composite_info_t *info)
  2535     PIXMAN_COMPOSITE_ARGS (info);
  2536     uint32_t src;
  2537     uint32_t    *dst_line, d;
  2538     uint32_t    *mask_line, m;
  2539     uint32_t pack_cmp;
  2540     int dst_stride, mask_stride;
  2542     __m128i xmm_src;
  2543     __m128i xmm_dst;
  2544     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  2546     __m128i mmx_src, mmx_mask, mmx_dest;
  2548     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  2550     if (src == 0)
  2551 	return;
  2553     PIXMAN_IMAGE_GET_LINE (
  2554 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  2555     PIXMAN_IMAGE_GET_LINE (
  2556 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
  2558     xmm_src = _mm_unpacklo_epi8 (
  2559 	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
  2560     mmx_src   = xmm_src;
  2562     while (height--)
  2564 	int w = width;
  2565 	const uint32_t *pm = (uint32_t *)mask_line;
  2566 	uint32_t *pd = (uint32_t *)dst_line;
  2568 	dst_line += dst_stride;
  2569 	mask_line += mask_stride;
  2571 	while (w && (uintptr_t)pd & 15)
  2573 	    m = *pm++;
  2575 	    if (m)
  2577 		d = *pd;
  2579 		mmx_mask = unpack_32_1x128 (m);
  2580 		mmx_dest = unpack_32_1x128 (d);
  2582 		*pd = pack_1x128_32 (
  2583 		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
  2584 				   mmx_dest));
  2587 	    pd++;
  2588 	    w--;
  2591 	while (w >= 4)
  2593 	    xmm_mask = load_128_unaligned ((__m128i*)pm);
  2595 	    pack_cmp =
  2596 		_mm_movemask_epi8 (
  2597 		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
  2599 	    /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
  2600 	    if (pack_cmp != 0xffff)
  2602 		xmm_dst = load_128_aligned ((__m128i*)pd);
  2604 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  2606 		pix_multiply_2x128 (&xmm_src, &xmm_src,
  2607 				    &xmm_mask_lo, &xmm_mask_hi,
  2608 				    &xmm_mask_lo, &xmm_mask_hi);
  2609 		xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
  2611 		save_128_aligned (
  2612 		    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
  2615 	    pd += 4;
  2616 	    pm += 4;
  2617 	    w -= 4;
  2620 	while (w)
  2622 	    m = *pm++;
  2624 	    if (m)
  2626 		d = *pd;
  2628 		mmx_mask = unpack_32_1x128 (m);
  2629 		mmx_dest = unpack_32_1x128 (d);
  2631 		*pd = pack_1x128_32 (
  2632 		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
  2633 				   mmx_dest));
  2636 	    pd++;
  2637 	    w--;
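/* OVER a solid source through an a8r8g8b8 component-alpha mask onto an
 * a8r8g8b8 destination.
 */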
  2643 static void
  2644 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
  2645                                     pixman_composite_info_t *info)
  2647     PIXMAN_COMPOSITE_ARGS (info);
  2648     uint32_t src;
  2649     uint32_t    *dst_line, d;
  2650     uint32_t    *mask_line, m;
  2651     uint32_t pack_cmp;
  2652     int dst_stride, mask_stride;
  2654     __m128i xmm_src, xmm_alpha;
  2655     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  2656     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  2658     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
  2660     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  2662     if (src == 0)
  2663 	return;
  2665     PIXMAN_IMAGE_GET_LINE (
  2666 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  2667     PIXMAN_IMAGE_GET_LINE (
  2668 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
  2670     xmm_src = _mm_unpacklo_epi8 (
  2671 	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
  2672     xmm_alpha = expand_alpha_1x128 (xmm_src);
  2673     mmx_src   = xmm_src;
  2674     mmx_alpha = xmm_alpha;
  2676     while (height--)
  2678 	int w = width;
  2679 	const uint32_t *pm = (uint32_t *)mask_line;
  2680 	uint32_t *pd = (uint32_t *)dst_line;
  2682 	dst_line += dst_stride;
  2683 	mask_line += mask_stride;
  2685 	while (w && (uintptr_t)pd & 15)
  2687 	    m = *pm++;
  2689 	    if (m)
  2691 		d = *pd;
  2692 		mmx_mask = unpack_32_1x128 (m);
  2693 		mmx_dest = unpack_32_1x128 (d);
  2695 		*pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
  2696 		                                  &mmx_alpha,
  2697 		                                  &mmx_mask,
  2698 		                                  &mmx_dest));
  2701 	    pd++;
  2702 	    w--;
  2705 	while (w >= 4)
  2707 	    xmm_mask = load_128_unaligned ((__m128i*)pm);
  2709 	    pack_cmp =
  2710 		_mm_movemask_epi8 (
  2711 		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
  2713 	    /* if all bits in the mask are zero, pack_cmp is equal to 0xffff */
  2714 	    if (pack_cmp != 0xffff)
  2716 		xmm_dst = load_128_aligned ((__m128i*)pd);
  2718 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  2719 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  2721 		in_over_2x128 (&xmm_src, &xmm_src,
  2722 			       &xmm_alpha, &xmm_alpha,
  2723 			       &xmm_mask_lo, &xmm_mask_hi,
  2724 			       &xmm_dst_lo, &xmm_dst_hi);
  2726 		save_128_aligned (
  2727 		    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  2730 	    pd += 4;
  2731 	    pm += 4;
  2732 	    w -= 4;
  2735 	while (w)
  2737 	    m = *pm++;
  2739 	    if (m)
  2741 		d = *pd;
  2742 		mmx_mask = unpack_32_1x128 (m);
  2743 		mmx_dest = unpack_32_1x128 (d);
  2745 		*pd = pack_1x128_32 (
  2746 		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
  2749 	    pd++;
  2750 	    w--;
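/* OVER an a8r8g8b8 source, scaled by the alpha of a solid mask, onto an
 * a8r8g8b8 destination; all-zero groups of four source pixels are skipped.
 */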
  2756 static void
  2757 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
  2758                                  pixman_composite_info_t *info)
  2760     PIXMAN_COMPOSITE_ARGS (info);
  2761     uint32_t    *dst_line, *dst;
  2762     uint32_t    *src_line, *src;
  2763     uint32_t mask;
  2764     int32_t w;
  2765     int dst_stride, src_stride;
  2767     __m128i xmm_mask;
  2768     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  2769     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  2770     __m128i xmm_alpha_lo, xmm_alpha_hi;
  2772     PIXMAN_IMAGE_GET_LINE (
  2773 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  2774     PIXMAN_IMAGE_GET_LINE (
  2775 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  2777     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
  2779     xmm_mask = create_mask_16_128 (mask >> 24);
  2781     while (height--)
  2783 	dst = dst_line;
  2784 	dst_line += dst_stride;
  2785 	src = src_line;
  2786 	src_line += src_stride;
  2787 	w = width;
  2789 	while (w && (uintptr_t)dst & 15)
  2791 	    uint32_t s = *src++;
  2793 	    if (s)
  2795 		uint32_t d = *dst;
  2797 		__m128i ms = unpack_32_1x128 (s);
  2798 		__m128i alpha    = expand_alpha_1x128 (ms);
  2799 		__m128i dest     = xmm_mask;
  2800 		__m128i alpha_dst = unpack_32_1x128 (d);
  2802 		*dst = pack_1x128_32 (
  2803 		    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
  2805 	    dst++;
  2806 	    w--;
  2809 	while (w >= 4)
  2811 	    xmm_src = load_128_unaligned ((__m128i*)src);
  2813 	    if (!is_zero (xmm_src))
  2815 		xmm_dst = load_128_aligned ((__m128i*)dst);
  2817 		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  2818 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  2819 		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  2820 				    &xmm_alpha_lo, &xmm_alpha_hi);
  2822 		in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
  2823 			       &xmm_alpha_lo, &xmm_alpha_hi,
  2824 			       &xmm_mask, &xmm_mask,
  2825 			       &xmm_dst_lo, &xmm_dst_hi);
  2827 		save_128_aligned (
  2828 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  2831 	    dst += 4;
  2832 	    src += 4;
  2833 	    w -= 4;
  2836 	while (w)
  2838 	    uint32_t s = *src++;
  2840 	    if (s)
  2842 		uint32_t d = *dst;
  2844 		__m128i ms = unpack_32_1x128 (s);
  2845 		__m128i alpha = expand_alpha_1x128 (ms);
  2846 		__m128i mask  = xmm_mask;
  2847 		__m128i dest  = unpack_32_1x128 (d);
  2849 		*dst = pack_1x128_32 (
  2850 		    in_over_1x128 (&ms, &alpha, &mask, &dest));
  2853 	    dst++;
  2854 	    w--;
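/* SRC: convert x8r8g8b8 pixels to r5g6b5, eight pixels per SSE2 iteration. */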
  2860 static void
  2861 sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
  2862                               pixman_composite_info_t *info)
  2864     PIXMAN_COMPOSITE_ARGS (info);
  2865     uint16_t    *dst_line, *dst;
  2866     uint32_t    *src_line, *src, s;
  2867     int dst_stride, src_stride;
  2868     int32_t w;
  2870     PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  2871     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  2873     while (height--)
  2875 	dst = dst_line;
  2876 	dst_line += dst_stride;
  2877 	src = src_line;
  2878 	src_line += src_stride;
  2879 	w = width;
  2881 	while (w && (uintptr_t)dst & 15)
  2883 	    s = *src++;
  2884 	    *dst = convert_8888_to_0565 (s);
  2885 	    dst++;
  2886 	    w--;
  2889 	while (w >= 8)
  2891 	    __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
  2892 	    __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
  2894 	    save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
  2896 	    w -= 8;
  2897 	    src += 8;
  2898 	    dst += 8;
  2901 	while (w)
  2903 	    s = *src++;
  2904 	    *dst = convert_8888_to_0565 (s);
  2905 	    dst++;
  2906 	    w--;
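/* SRC: copy x8r8g8b8 to a8r8g8b8, forcing the alpha byte to 0xff. */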
  2911 static void
  2912 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
  2913 			      pixman_composite_info_t *info)
  2915     PIXMAN_COMPOSITE_ARGS (info);
  2916     uint32_t    *dst_line, *dst;
  2917     uint32_t    *src_line, *src;
  2918     int32_t w;
  2919     int dst_stride, src_stride;
  2922     PIXMAN_IMAGE_GET_LINE (
  2923 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  2924     PIXMAN_IMAGE_GET_LINE (
  2925 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  2927     while (height--)
  2929 	dst = dst_line;
  2930 	dst_line += dst_stride;
  2931 	src = src_line;
  2932 	src_line += src_stride;
  2933 	w = width;
  2935 	while (w && (uintptr_t)dst & 15)
  2937 	    *dst++ = *src++ | 0xff000000;
  2938 	    w--;
  2941 	while (w >= 16)
  2943 	    __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
  2945 	    xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
  2946 	    xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
  2947 	    xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
  2948 	    xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
  2950 	    save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
  2951 	    save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
  2952 	    save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
  2953 	    save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
  2955 	    dst += 16;
  2956 	    src += 16;
  2957 	    w -= 16;
  2960 	while (w)
  2962 	    *dst++ = *src++ | 0xff000000;
  2963 	    w--;
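/* OVER an x8r8g8b8 source (treated as opaque), scaled by the alpha of a
 * solid mask, onto an a8r8g8b8 destination.
 */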
  2969 static void
  2970 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
  2971                                  pixman_composite_info_t *info)
  2973     PIXMAN_COMPOSITE_ARGS (info);
  2974     uint32_t    *dst_line, *dst;
  2975     uint32_t    *src_line, *src;
  2976     uint32_t mask;
  2977     int dst_stride, src_stride;
  2978     int32_t w;
  2980     __m128i xmm_mask, xmm_alpha;
  2981     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  2982     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  2984     PIXMAN_IMAGE_GET_LINE (
  2985 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  2986     PIXMAN_IMAGE_GET_LINE (
  2987 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  2989     mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
  2991     xmm_mask = create_mask_16_128 (mask >> 24);
  2992     xmm_alpha = mask_00ff;
  2994     while (height--)
  2996 	dst = dst_line;
  2997 	dst_line += dst_stride;
  2998 	src = src_line;
  2999 	src_line += src_stride;
  3000 	w = width;
  3002 	while (w && (uintptr_t)dst & 15)
  3004 	    uint32_t s = (*src++) | 0xff000000;
  3005 	    uint32_t d = *dst;
  3007 	    __m128i src   = unpack_32_1x128 (s);
  3008 	    __m128i alpha = xmm_alpha;
  3009 	    __m128i mask  = xmm_mask;
  3010 	    __m128i dest  = unpack_32_1x128 (d);
  3012 	    *dst++ = pack_1x128_32 (
  3013 		in_over_1x128 (&src, &alpha, &mask, &dest));
  3015 	    w--;
  3018 	while (w >= 4)
  3020 	    xmm_src = _mm_or_si128 (
  3021 		load_128_unaligned ((__m128i*)src), mask_ff000000);
  3022 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  3024 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  3025 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  3027 	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
  3028 			   &xmm_alpha, &xmm_alpha,
  3029 			   &xmm_mask, &xmm_mask,
  3030 			   &xmm_dst_lo, &xmm_dst_hi);
  3032 	    save_128_aligned (
  3033 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  3035 	    dst += 4;
  3036 	    src += 4;
  3037 	    w -= 4;
  3041 	while (w)
  3043 	    uint32_t s = (*src++) | 0xff000000;
  3044 	    uint32_t d = *dst;
  3046 	    __m128i src  = unpack_32_1x128 (s);
  3047 	    __m128i alpha = xmm_alpha;
  3048 	    __m128i mask  = xmm_mask;
  3049 	    __m128i dest  = unpack_32_1x128 (d);
  3051 	    *dst++ = pack_1x128_32 (
  3052 		in_over_1x128 (&src, &alpha, &mask, &dest));
  3054 	    w--;
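/* OVER an a8r8g8b8 source onto an a8r8g8b8 destination, one scanline at a
 * time via sse2_combine_over_u.
 */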
  3060 static void
  3061 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
  3062                                pixman_composite_info_t *info)
  3064     PIXMAN_COMPOSITE_ARGS (info);
  3065     int dst_stride, src_stride;
  3066     uint32_t    *dst_line, *dst;
  3067     uint32_t    *src_line, *src;
  3069     PIXMAN_IMAGE_GET_LINE (
  3070 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  3071     PIXMAN_IMAGE_GET_LINE (
  3072 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  3074     dst = dst_line;
  3075     src = src_line;
  3077     while (height--)
  3079 	sse2_combine_over_u (imp, op, dst, src, NULL, width);
  3081 	dst += dst_stride;
  3082 	src += src_stride;
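/* OVER a single a8r8g8b8 pixel onto an r5g6b5 pixel. */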
  3086 static force_inline uint16_t
  3087 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
  3089     __m128i ms;
  3091     ms = unpack_32_1x128 (src);
  3092     return pack_565_32_16 (
  3093 	pack_1x128_32 (
  3094 	    over_1x128 (
  3095 		ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
  3098 static void
  3099 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
  3100                                pixman_composite_info_t *info)
  3102     PIXMAN_COMPOSITE_ARGS (info);
  3103     uint16_t    *dst_line, *dst, d;
  3104     uint32_t    *src_line, *src, s;
  3105     int dst_stride, src_stride;
  3106     int32_t w;
  3108     __m128i xmm_alpha_lo, xmm_alpha_hi;
  3109     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  3110     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
  3112     PIXMAN_IMAGE_GET_LINE (
  3113 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  3114     PIXMAN_IMAGE_GET_LINE (
  3115 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  3117     while (height--)
  3119 	dst = dst_line;
  3120 	src = src_line;
  3122 	dst_line += dst_stride;
  3123 	src_line += src_stride;
  3124 	w = width;
  3126 	/* Align dst on a 16-byte boundary */
  3127 	while (w &&
  3128 	       ((uintptr_t)dst & 15))
  3130 	    s = *src++;
  3131 	    d = *dst;
  3133 	    *dst++ = composite_over_8888_0565pixel (s, d);
  3134 	    w--;
  3137 	/* It's an 8 pixel loop */
  3138 	while (w >= 8)
  3140 	    /* I'm loading unaligned because I'm not sure
  3141 	     * about the address alignment.
  3142 	     */
  3143 	    xmm_src = load_128_unaligned ((__m128i*) src);
  3144 	    xmm_dst = load_128_aligned ((__m128i*) dst);
  3146 	    /* Unpacking */
  3147 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  3148 	    unpack_565_128_4x128 (xmm_dst,
  3149 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
  3150 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  3151 				&xmm_alpha_lo, &xmm_alpha_hi);
  3153 	    /* Load the next 4 pixels from memory ahead of time
  3154 	     * to optimize the memory read.
  3155 	     */
  3156 	    xmm_src = load_128_unaligned ((__m128i*) (src + 4));
  3158 	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
  3159 			&xmm_alpha_lo, &xmm_alpha_hi,
  3160 			&xmm_dst0, &xmm_dst1);
  3162 	    /* Unpacking */
  3163 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  3164 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  3165 				&xmm_alpha_lo, &xmm_alpha_hi);
  3167 	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
  3168 			&xmm_alpha_lo, &xmm_alpha_hi,
  3169 			&xmm_dst2, &xmm_dst3);
  3171 	    save_128_aligned (
  3172 		(__m128i*)dst, pack_565_4x128_128 (
  3173 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
  3175 	    w -= 8;
  3176 	    dst += 8;
  3177 	    src += 8;
  3180 	while (w--)
  3182 	    s = *src++;
  3183 	    d = *dst;
  3185 	    *dst++ = composite_over_8888_0565pixel (s, d);
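/* OVER a solid source through an a8 mask onto an a8r8g8b8 destination;
 * four fully-opaque mask bytes with an opaque source are stored directly,
 * all-zero mask groups are skipped.
 */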
  3191 static void
  3192 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
  3193                               pixman_composite_info_t *info)
  3195     PIXMAN_COMPOSITE_ARGS (info);
  3196     uint32_t src, srca;
  3197     uint32_t *dst_line, *dst;
  3198     uint8_t *mask_line, *mask;
  3199     int dst_stride, mask_stride;
  3200     int32_t w;
  3201     uint32_t m, d;
  3203     __m128i xmm_src, xmm_alpha, xmm_def;
  3204     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  3205     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  3207     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
  3209     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  3211     srca = src >> 24;
  3212     if (src == 0)
  3213 	return;
  3215     PIXMAN_IMAGE_GET_LINE (
  3216 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  3217     PIXMAN_IMAGE_GET_LINE (
  3218 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  3220     xmm_def = create_mask_2x32_128 (src, src);
  3221     xmm_src = expand_pixel_32_1x128 (src);
  3222     xmm_alpha = expand_alpha_1x128 (xmm_src);
  3223     mmx_src   = xmm_src;
  3224     mmx_alpha = xmm_alpha;
  3226     while (height--)
  3228 	dst = dst_line;
  3229 	dst_line += dst_stride;
  3230 	mask = mask_line;
  3231 	mask_line += mask_stride;
  3232 	w = width;
  3234 	while (w && (uintptr_t)dst & 15)
  3236 	    uint8_t m = *mask++;
  3238 	    if (m)
  3240 		d = *dst;
  3241 		mmx_mask = expand_pixel_8_1x128 (m);
  3242 		mmx_dest = unpack_32_1x128 (d);
  3244 		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
  3245 		                                   &mmx_alpha,
  3246 		                                   &mmx_mask,
  3247 		                                   &mmx_dest));
  3250 	    w--;
  3251 	    dst++;
  3254 	while (w >= 4)
  3256 	    m = *((uint32_t*)mask);
  3258 	    if (srca == 0xff && m == 0xffffffff)
  3260 		save_128_aligned ((__m128i*)dst, xmm_def);
  3262 	    else if (m)
  3264 		xmm_dst = load_128_aligned ((__m128i*) dst);
  3265 		xmm_mask = unpack_32_1x128 (m);
  3266 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
  3268 		/* Unpacking */
  3269 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  3270 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  3272 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
  3273 					&xmm_mask_lo, &xmm_mask_hi);
  3275 		in_over_2x128 (&xmm_src, &xmm_src,
  3276 			       &xmm_alpha, &xmm_alpha,
  3277 			       &xmm_mask_lo, &xmm_mask_hi,
  3278 			       &xmm_dst_lo, &xmm_dst_hi);
  3280 		save_128_aligned (
  3281 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  3284 	    w -= 4;
  3285 	    dst += 4;
  3286 	    mask += 4;
  3289 	while (w)
  3291 	    uint8_t m = *mask++;
  3293 	    if (m)
  3295 		d = *dst;
  3296 		mmx_mask = expand_pixel_8_1x128 (m);
  3297 		mmx_dest = unpack_32_1x128 (d);
  3299 		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
  3300 		                                   &mmx_alpha,
  3301 		                                   &mmx_mask,
  3302 		                                   &mmx_dest));
  3305 	    w--;
  3306 	    dst++;
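/* Fill a rectangle of an 8, 16 or 32 bpp buffer with a constant value,
 * replicating the filler to 32 bits and storing up to 128 bytes per
 * iteration once the destination is 16-byte aligned.
 */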
  3312 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
  3313 __attribute__((__force_align_arg_pointer__))
  3314 #endif
  3315 static pixman_bool_t
  3316 sse2_fill (pixman_implementation_t *imp,
  3317            uint32_t *               bits,
  3318            int                      stride,
  3319            int                      bpp,
  3320            int                      x,
  3321            int                      y,
  3322            int                      width,
  3323            int                      height,
  3324            uint32_t		    filler)
  3326     uint32_t byte_width;
  3327     uint8_t *byte_line;
  3329     __m128i xmm_def;
  3331     if (bpp == 8)
  3333 	uint8_t b;
  3334 	uint16_t w;
  3336 	stride = stride * (int) sizeof (uint32_t) / 1;
  3337 	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
  3338 	byte_width = width;
  3339 	stride *= 1;
  3341 	b = filler & 0xff;
  3342 	w = (b << 8) | b;
  3343 	filler = (w << 16) | w;
  3345     else if (bpp == 16)
  3347 	stride = stride * (int) sizeof (uint32_t) / 2;
  3348 	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
  3349 	byte_width = 2 * width;
  3350 	stride *= 2;
  3352         filler = (filler & 0xffff) * 0x00010001;
  3354     else if (bpp == 32)
  3356 	stride = stride * (int) sizeof (uint32_t) / 4;
  3357 	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
  3358 	byte_width = 4 * width;
  3359 	stride *= 4;
  3361     else
  3363 	return FALSE;
  3366     xmm_def = create_mask_2x32_128 (filler, filler);
  3368     while (height--)
  3370 	int w;
  3371 	uint8_t *d = byte_line;
  3372 	byte_line += stride;
  3373 	w = byte_width;
  3375 	if (w >= 1 && ((uintptr_t)d & 1))
  3377 	    *(uint8_t *)d = filler;
  3378 	    w -= 1;
  3379 	    d += 1;
  3382 	while (w >= 2 && ((uintptr_t)d & 3))
  3384 	    *(uint16_t *)d = filler;
  3385 	    w -= 2;
  3386 	    d += 2;
  3389 	while (w >= 4 && ((uintptr_t)d & 15))
  3391 	    *(uint32_t *)d = filler;
  3393 	    w -= 4;
  3394 	    d += 4;
  3397 	while (w >= 128)
  3399 	    save_128_aligned ((__m128i*)(d),     xmm_def);
  3400 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
  3401 	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
  3402 	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
  3403 	    save_128_aligned ((__m128i*)(d + 64),  xmm_def);
  3404 	    save_128_aligned ((__m128i*)(d + 80),  xmm_def);
  3405 	    save_128_aligned ((__m128i*)(d + 96),  xmm_def);
  3406 	    save_128_aligned ((__m128i*)(d + 112), xmm_def);
  3408 	    d += 128;
  3409 	    w -= 128;
  3412 	if (w >= 64)
  3414 	    save_128_aligned ((__m128i*)(d),     xmm_def);
  3415 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
  3416 	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
  3417 	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
  3419 	    d += 64;
  3420 	    w -= 64;
  3423 	if (w >= 32)
  3425 	    save_128_aligned ((__m128i*)(d),     xmm_def);
  3426 	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
  3428 	    d += 32;
  3429 	    w -= 32;
  3432 	if (w >= 16)
  3434 	    save_128_aligned ((__m128i*)(d),     xmm_def);
  3436 	    d += 16;
  3437 	    w -= 16;
  3440 	while (w >= 4)
  3442 	    *(uint32_t *)d = filler;
  3444 	    w -= 4;
  3445 	    d += 4;
  3448 	if (w >= 2)
  3450 	    *(uint16_t *)d = filler;
  3451 	    w -= 2;
  3452 	    d += 2;
  3455 	if (w >= 1)
  3457 	    *(uint8_t *)d = filler;
  3458 	    w -= 1;
  3459 	    d += 1;
  3463     return TRUE;
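/* SRC: a solid color multiplied by an a8 mask, written to an a8r8g8b8
 * destination (zero where the mask is zero).
 */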
  3466 static void
  3467 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
  3468                              pixman_composite_info_t *info)
  3470     PIXMAN_COMPOSITE_ARGS (info);
  3471     uint32_t src, srca;
  3472     uint32_t    *dst_line, *dst;
  3473     uint8_t     *mask_line, *mask;
  3474     int dst_stride, mask_stride;
  3475     int32_t w;
  3476     uint32_t m;
  3478     __m128i xmm_src, xmm_def;
  3479     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  3481     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  3483     srca = src >> 24;
  3484     if (src == 0)
  3486 	sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
  3487 		   PIXMAN_FORMAT_BPP (dest_image->bits.format),
  3488 		   dest_x, dest_y, width, height, 0);
  3489 	return;
  3492     PIXMAN_IMAGE_GET_LINE (
  3493 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  3494     PIXMAN_IMAGE_GET_LINE (
  3495 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  3497     xmm_def = create_mask_2x32_128 (src, src);
  3498     xmm_src = expand_pixel_32_1x128 (src);
  3500     while (height--)
  3502 	dst = dst_line;
  3503 	dst_line += dst_stride;
  3504 	mask = mask_line;
  3505 	mask_line += mask_stride;
  3506 	w = width;
  3508 	while (w && (uintptr_t)dst & 15)
  3510 	    uint8_t m = *mask++;
  3512 	    if (m)
  3514 		*dst = pack_1x128_32 (
  3515 		    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
  3517 	    else
  3519 		*dst = 0;
  3522 	    w--;
  3523 	    dst++;
  3526 	while (w >= 4)
  3528 	    m = *((uint32_t*)mask);
  3530 	    if (srca == 0xff && m == 0xffffffff)
  3532 		save_128_aligned ((__m128i*)dst, xmm_def);
  3534 	    else if (m)
  3536 		xmm_mask = unpack_32_1x128 (m);
  3537 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
  3539 		/* Unpacking */
  3540 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  3542 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
  3543 					&xmm_mask_lo, &xmm_mask_hi);
  3545 		pix_multiply_2x128 (&xmm_src, &xmm_src,
  3546 				    &xmm_mask_lo, &xmm_mask_hi,
  3547 				    &xmm_mask_lo, &xmm_mask_hi);
  3549 		save_128_aligned (
  3550 		    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
  3552 	    else
  3554 		save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
  3557 	    w -= 4;
  3558 	    dst += 4;
  3559 	    mask += 4;
  3562 	while (w)
  3564 	    uint8_t m = *mask++;
  3566 	    if (m)
  3568 		*dst = pack_1x128_32 (
  3569 		    pix_multiply_1x128 (
  3570 			xmm_src, expand_pixel_8_1x128 (m)));
  3572 	    else
  3574 		*dst = 0;
  3577 	    w--;
  3578 	    dst++;
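/* OVER a solid source through an a8 mask onto an r5g6b5 destination. */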
  3584 static void
  3585 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
  3586                               pixman_composite_info_t *info)
  3588     PIXMAN_COMPOSITE_ARGS (info);
  3589     uint32_t src;
  3590     uint16_t    *dst_line, *dst, d;
  3591     uint8_t     *mask_line, *mask;
  3592     int dst_stride, mask_stride;
  3593     int32_t w;
  3594     uint32_t m;
  3595     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
  3597     __m128i xmm_src, xmm_alpha;
  3598     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  3599     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
  3601     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  3603     if (src == 0)
  3604 	return;
  3606     PIXMAN_IMAGE_GET_LINE (
  3607 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  3608     PIXMAN_IMAGE_GET_LINE (
  3609 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  3611     xmm_src = expand_pixel_32_1x128 (src);
  3612     xmm_alpha = expand_alpha_1x128 (xmm_src);
  3613     mmx_src = xmm_src;
  3614     mmx_alpha = xmm_alpha;
  3616     while (height--)
  3618 	dst = dst_line;
  3619 	dst_line += dst_stride;
  3620 	mask = mask_line;
  3621 	mask_line += mask_stride;
  3622 	w = width;
  3624 	while (w && (uintptr_t)dst & 15)
  3626 	    m = *mask++;
  3628 	    if (m)
  3630 		d = *dst;
  3631 		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
  3632 		mmx_dest = expand565_16_1x128 (d);
  3634 		*dst = pack_565_32_16 (
  3635 		    pack_1x128_32 (
  3636 			in_over_1x128 (
  3637 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
  3640 	    w--;
  3641 	    dst++;
  3644 	while (w >= 8)
  3646 	    xmm_dst = load_128_aligned ((__m128i*) dst);
  3647 	    unpack_565_128_4x128 (xmm_dst,
  3648 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
  3650 	    m = *((uint32_t*)mask);
  3651 	    mask += 4;
  3653 	    if (m)
  3655 		xmm_mask = unpack_32_1x128 (m);
  3656 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
  3658 		/* Unpacking */
  3659 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  3661 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
  3662 					&xmm_mask_lo, &xmm_mask_hi);
  3664 		in_over_2x128 (&xmm_src, &xmm_src,
  3665 			       &xmm_alpha, &xmm_alpha,
  3666 			       &xmm_mask_lo, &xmm_mask_hi,
  3667 			       &xmm_dst0, &xmm_dst1);
  3670 	    m = *((uint32_t*)mask);
  3671 	    mask += 4;
  3673 	    if (m)
  3675 		xmm_mask = unpack_32_1x128 (m);
  3676 		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
  3678 		/* Unpacking */
  3679 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  3681 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
  3682 					&xmm_mask_lo, &xmm_mask_hi);
  3683 		in_over_2x128 (&xmm_src, &xmm_src,
  3684 			       &xmm_alpha, &xmm_alpha,
  3685 			       &xmm_mask_lo, &xmm_mask_hi,
  3686 			       &xmm_dst2, &xmm_dst3);
  3689 	    save_128_aligned (
  3690 		(__m128i*)dst, pack_565_4x128_128 (
  3691 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
  3693 	    w -= 8;
  3694 	    dst += 8;
  3697 	while (w)
  3699 	    m = *mask++;
  3701 	    if (m)
  3703 		d = *dst;
  3704 		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
  3705 		mmx_dest = expand565_16_1x128 (d);
  3707 		*dst = pack_565_32_16 (
  3708 		    pack_1x128_32 (
  3709 			in_over_1x128 (
  3710 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
  3713 	    w--;
  3714 	    dst++;
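/* OVER a non-premultiplied source with swapped red/blue channels (the
 * pixbuf layout) onto an r5g6b5 destination; fully opaque source groups
 * only get their colors reordered, fully transparent ones are skipped.
 */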
  3720 static void
  3721 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
  3722                                  pixman_composite_info_t *info)
  3724     PIXMAN_COMPOSITE_ARGS (info);
  3725     uint16_t    *dst_line, *dst, d;
  3726     uint32_t    *src_line, *src, s;
  3727     int dst_stride, src_stride;
  3728     int32_t w;
  3729     uint32_t opaque, zero;
  3731     __m128i ms;
  3732     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  3733     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
  3735     PIXMAN_IMAGE_GET_LINE (
  3736 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  3737     PIXMAN_IMAGE_GET_LINE (
  3738 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  3740     while (height--)
  3742 	dst = dst_line;
  3743 	dst_line += dst_stride;
  3744 	src = src_line;
  3745 	src_line += src_stride;
  3746 	w = width;
  3748 	while (w && (uintptr_t)dst & 15)
  3750 	    s = *src++;
  3751 	    d = *dst;
  3753 	    ms = unpack_32_1x128 (s);
  3755 	    *dst++ = pack_565_32_16 (
  3756 		pack_1x128_32 (
  3757 		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
  3758 	    w--;
  3761 	while (w >= 8)
  3763 	    /* First round */
  3764 	    xmm_src = load_128_unaligned ((__m128i*)src);
  3765 	    xmm_dst = load_128_aligned  ((__m128i*)dst);
  3767 	    opaque = is_opaque (xmm_src);
  3768 	    zero = is_zero (xmm_src);
  3770 	    unpack_565_128_4x128 (xmm_dst,
  3771 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
  3772 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  3774 	    /* preload next round */
  3775 	    xmm_src = load_128_unaligned ((__m128i*)(src + 4));
  3777 	    if (opaque)
  3779 		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
  3780 				     &xmm_dst0, &xmm_dst1);
  3782 	    else if (!zero)
  3784 		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
  3785 					&xmm_dst0, &xmm_dst1);
  3788 	    /* Second round */
  3789 	    opaque = is_opaque (xmm_src);
  3790 	    zero = is_zero (xmm_src);
  3792 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  3794 	    if (opaque)
  3796 		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
  3797 				     &xmm_dst2, &xmm_dst3);
  3799 	    else if (!zero)
  3801 		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
  3802 					&xmm_dst2, &xmm_dst3);
  3805 	    save_128_aligned (
  3806 		(__m128i*)dst, pack_565_4x128_128 (
  3807 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
  3809 	    w -= 8;
  3810 	    src += 8;
  3811 	    dst += 8;
  3814 	while (w)
  3816 	    s = *src++;
  3817 	    d = *dst;
  3819 	    ms = unpack_32_1x128 (s);
  3821 	    *dst++ = pack_565_32_16 (
  3822 		pack_1x128_32 (
  3823 		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
  3824 	    w--;
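/* OVER a non-premultiplied source with swapped red/blue channels (the
 * pixbuf layout) onto an a8r8g8b8 destination.
 */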
  3830 static void
  3831 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
  3832                                  pixman_composite_info_t *info)
  3834     PIXMAN_COMPOSITE_ARGS (info);
  3835     uint32_t    *dst_line, *dst, d;
  3836     uint32_t    *src_line, *src, s;
  3837     int dst_stride, src_stride;
  3838     int32_t w;
  3839     uint32_t opaque, zero;
  3841     __m128i xmm_src_lo, xmm_src_hi;
  3842     __m128i xmm_dst_lo, xmm_dst_hi;
  3844     PIXMAN_IMAGE_GET_LINE (
  3845 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  3846     PIXMAN_IMAGE_GET_LINE (
  3847 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  3849     while (height--)
  3851 	dst = dst_line;
  3852 	dst_line += dst_stride;
  3853 	src = src_line;
  3854 	src_line += src_stride;
  3855 	w = width;
  3857 	while (w && (uintptr_t)dst & 15)
  3859 	    s = *src++;
  3860 	    d = *dst;
  3862 	    *dst++ = pack_1x128_32 (
  3863 		over_rev_non_pre_1x128 (
  3864 		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
  3866 	    w--;
  3869 	while (w >= 4)
  3871 	    xmm_src_hi = load_128_unaligned ((__m128i*)src);
  3873 	    opaque = is_opaque (xmm_src_hi);
  3874 	    zero = is_zero (xmm_src_hi);
  3876 	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  3878 	    if (opaque)
  3880 		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
  3881 				     &xmm_dst_lo, &xmm_dst_hi);
  3883 		save_128_aligned (
  3884 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  3886 	    else if (!zero)
  3888 		xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
  3890 		unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  3892 		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
  3893 					&xmm_dst_lo, &xmm_dst_hi);
  3895 		save_128_aligned (
  3896 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  3899 	    w -= 4;
  3900 	    dst += 4;
  3901 	    src += 4;
  3904 	while (w)
  3906 	    s = *src++;
  3907 	    d = *dst;
  3909 	    *dst++ = pack_1x128_32 (
  3910 		over_rev_non_pre_1x128 (
  3911 		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
  3913 	    w--;
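/* OVER a solid source through an a8r8g8b8 component-alpha mask onto an
 * r5g6b5 destination, eight pixels per iteration.
 */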
  3919 static void
  3920 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
  3921                                     pixman_composite_info_t *info)
  3923     PIXMAN_COMPOSITE_ARGS (info);
  3924     uint32_t src;
  3925     uint16_t    *dst_line, *dst, d;
  3926     uint32_t    *mask_line, *mask, m;
  3927     int dst_stride, mask_stride;
  3928     int w;
  3929     uint32_t pack_cmp;
  3931     __m128i xmm_src, xmm_alpha;
  3932     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  3933     __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
  3935     __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
  3937     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  3939     if (src == 0)
  3940 	return;
  3942     PIXMAN_IMAGE_GET_LINE (
  3943 	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  3944     PIXMAN_IMAGE_GET_LINE (
  3945 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
  3947     xmm_src = expand_pixel_32_1x128 (src);
  3948     xmm_alpha = expand_alpha_1x128 (xmm_src);
  3949     mmx_src = xmm_src;
  3950     mmx_alpha = xmm_alpha;
  3952     while (height--)
  3954 	w = width;
  3955 	mask = mask_line;
  3956 	dst = dst_line;
  3957 	mask_line += mask_stride;
  3958 	dst_line += dst_stride;
  3960 	while (w && ((uintptr_t)dst & 15))
  3962 	    m = *(uint32_t *) mask;
  3964 	    if (m)
  3966 		d = *dst;
  3967 		mmx_mask = unpack_32_1x128 (m);
  3968 		mmx_dest = expand565_16_1x128 (d);
  3970 		*dst = pack_565_32_16 (
  3971 		    pack_1x128_32 (
  3972 			in_over_1x128 (
  3973 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
  3976 	    w--;
  3977 	    dst++;
  3978 	    mask++;
  3981 	while (w >= 8)
  3983 	    /* First round */
  3984 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
  3985 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  3987 	    pack_cmp = _mm_movemask_epi8 (
  3988 		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
  3990 	    unpack_565_128_4x128 (xmm_dst,
  3991 				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
  3992 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  3994 	    /* preload next round */
  3995 	    xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
  3997 	    /* first round: composite only when the mask is not all zero */
  3998 	    if (pack_cmp != 0xffff)
  4000 		in_over_2x128 (&xmm_src, &xmm_src,
  4001 			       &xmm_alpha, &xmm_alpha,
  4002 			       &xmm_mask_lo, &xmm_mask_hi,
  4003 			       &xmm_dst0, &xmm_dst1);
  4006 	    /* Second round */
  4007 	    pack_cmp = _mm_movemask_epi8 (
  4008 		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
  4010 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  4012 	    if (pack_cmp != 0xffff)
  4014 		in_over_2x128 (&xmm_src, &xmm_src,
  4015 			       &xmm_alpha, &xmm_alpha,
  4016 			       &xmm_mask_lo, &xmm_mask_hi,
  4017 			       &xmm_dst2, &xmm_dst3);
  4020 	    save_128_aligned (
  4021 		(__m128i*)dst, pack_565_4x128_128 (
  4022 		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
  4024 	    w -= 8;
  4025 	    dst += 8;
  4026 	    mask += 8;
  4029 	while (w)
  4031 	    m = *(uint32_t *) mask;
  4033 	    if (m)
  4035 		d = *dst;
  4036 		mmx_mask = unpack_32_1x128 (m);
  4037 		mmx_dest = expand565_16_1x128 (d);
  4039 		*dst = pack_565_32_16 (
  4040 		    pack_1x128_32 (
  4041 			in_over_1x128 (
  4042 			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
  4045 	    w--;
  4046 	    dst++;
  4047 	    mask++;
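/* IN: dest = solid.alpha * mask * dest on an a8 destination. */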
  4053 static void
  4054 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
  4055                          pixman_composite_info_t *info)
  4057     PIXMAN_COMPOSITE_ARGS (info);
  4058     uint8_t     *dst_line, *dst;
  4059     uint8_t     *mask_line, *mask;
  4060     int dst_stride, mask_stride;
  4061     uint32_t d, m;
  4062     uint32_t src;
  4063     int32_t w;
  4065     __m128i xmm_alpha;
  4066     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  4067     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  4069     PIXMAN_IMAGE_GET_LINE (
  4070 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  4071     PIXMAN_IMAGE_GET_LINE (
  4072 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  4074     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  4076     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
  4078     while (height--)
  4080 	dst = dst_line;
  4081 	dst_line += dst_stride;
  4082 	mask = mask_line;
  4083 	mask_line += mask_stride;
  4084 	w = width;
  4086 	while (w && ((uintptr_t)dst & 15))
  4088 	    m = (uint32_t) *mask++;
  4089 	    d = (uint32_t) *dst;
  4091 	    *dst++ = (uint8_t) pack_1x128_32 (
  4092 		pix_multiply_1x128 (
  4093 		    pix_multiply_1x128 (xmm_alpha,
  4094 				       unpack_32_1x128 (m)),
  4095 		    unpack_32_1x128 (d)));
  4096 	    w--;
  4099 	while (w >= 16)
  4101 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
  4102 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  4104 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  4105 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  4107 	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
  4108 				&xmm_mask_lo, &xmm_mask_hi,
  4109 				&xmm_mask_lo, &xmm_mask_hi);
  4111 	    pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
  4112 				&xmm_dst_lo, &xmm_dst_hi,
  4113 				&xmm_dst_lo, &xmm_dst_hi);
  4115 	    save_128_aligned (
  4116 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  4118 	    mask += 16;
  4119 	    dst += 16;
  4120 	    w -= 16;
  4123 	while (w)
  4125 	    m = (uint32_t) *mask++;
  4126 	    d = (uint32_t) *dst;
  4128 	    *dst++ = (uint8_t) pack_1x128_32 (
  4129 		pix_multiply_1x128 (
  4130 		    pix_multiply_1x128 (
  4131 			xmm_alpha, unpack_32_1x128 (m)),
  4132 		    unpack_32_1x128 (d)));
  4133 	    w--;
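/*
 * IN operator: solid source, no mask, a8 destination (dest = src.alpha * dest).
 * A fully opaque source is a no-op and a zero source degenerates to a zero
 * fill, so both cases are short-circuited before the per-pixel loops.
 */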
  4139 static void
  4140 sse2_composite_in_n_8 (pixman_implementation_t *imp,
  4141 		       pixman_composite_info_t *info)
  4143     PIXMAN_COMPOSITE_ARGS (info);
  4144     uint8_t     *dst_line, *dst;
  4145     int dst_stride;
  4146     uint32_t d;
  4147     uint32_t src;
  4148     int32_t w;
  4150     __m128i xmm_alpha;
  4151     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  4153     PIXMAN_IMAGE_GET_LINE (
  4154 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  4156     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  4158     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
  4160     src = src >> 24;
  4162     if (src == 0xff)
  4163 	return;
  4165     if (src == 0x00)
  4167 	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
  4168 		     8, dest_x, dest_y, width, height, src);
  4170 	return;
  4173     while (height--)
  4175 	dst = dst_line;
  4176 	dst_line += dst_stride;
  4177 	w = width;
  4179 	while (w && ((uintptr_t)dst & 15))
  4181 	    d = (uint32_t) *dst;
  4183 	    *dst++ = (uint8_t) pack_1x128_32 (
  4184 		pix_multiply_1x128 (
  4185 		    xmm_alpha,
  4186 		    unpack_32_1x128 (d)));
  4187 	    w--;
  4190 	while (w >= 16)
  4192 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  4194 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  4196 	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
  4197 				&xmm_dst_lo, &xmm_dst_hi,
  4198 				&xmm_dst_lo, &xmm_dst_hi);
  4200 	    save_128_aligned (
  4201 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  4203 	    dst += 16;
  4204 	    w -= 16;
  4207 	while (w)
  4209 	    d = (uint32_t) *dst;
  4211 	    *dst++ = (uint8_t) pack_1x128_32 (
  4212 		pix_multiply_1x128 (
  4213 		    xmm_alpha,
  4214 		    unpack_32_1x128 (d)));
  4215 	    w--;
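/*
 * IN operator: a8 source, a8 destination (dest = s * d per byte),
 * 16 pixels per iteration in the aligned middle loop.
 */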
  4221 static void
  4222 sse2_composite_in_8_8 (pixman_implementation_t *imp,
  4223                        pixman_composite_info_t *info)
  4225     PIXMAN_COMPOSITE_ARGS (info);
  4226     uint8_t     *dst_line, *dst;
  4227     uint8_t     *src_line, *src;
  4228     int src_stride, dst_stride;
  4229     int32_t w;
  4230     uint32_t s, d;
  4232     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  4233     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  4235     PIXMAN_IMAGE_GET_LINE (
  4236 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  4237     PIXMAN_IMAGE_GET_LINE (
  4238 	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
  4240     while (height--)
  4242 	dst = dst_line;
  4243 	dst_line += dst_stride;
  4244 	src = src_line;
  4245 	src_line += src_stride;
  4246 	w = width;
  4248 	while (w && ((uintptr_t)dst & 15))
  4250 	    s = (uint32_t) *src++;
  4251 	    d = (uint32_t) *dst;
  4253 	    *dst++ = (uint8_t) pack_1x128_32 (
  4254 		pix_multiply_1x128 (
  4255 		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
  4256 	    w--;
  4259 	while (w >= 16)
  4261 	    xmm_src = load_128_unaligned ((__m128i*)src);
  4262 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  4264 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  4265 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  4267 	    pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  4268 				&xmm_dst_lo, &xmm_dst_hi,
  4269 				&xmm_dst_lo, &xmm_dst_hi);
  4271 	    save_128_aligned (
  4272 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  4274 	    src += 16;
  4275 	    dst += 16;
  4276 	    w -= 16;
  4279 	while (w)
  4281 	    s = (uint32_t) *src++;
  4282 	    d = (uint32_t) *dst;
  4284 	    *dst++ = (uint8_t) pack_1x128_32 (
  4285 		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
  4286 	    w--;
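/*
 * ADD operator: solid source, a8 mask, a8 destination.  Each byte becomes
 * saturate (src.alpha * m + d); the saturating add is _mm_adds_epu16 on
 * the unpacked 16-bit lanes.
 */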
  4292 static void
  4293 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
  4294 			  pixman_composite_info_t *info)
  4296     PIXMAN_COMPOSITE_ARGS (info);
  4297     uint8_t     *dst_line, *dst;
  4298     uint8_t     *mask_line, *mask;
  4299     int dst_stride, mask_stride;
  4300     int32_t w;
  4301     uint32_t src;
  4302     uint32_t m, d;
  4304     __m128i xmm_alpha;
  4305     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  4306     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  4308     PIXMAN_IMAGE_GET_LINE (
  4309 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  4310     PIXMAN_IMAGE_GET_LINE (
  4311 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  4313     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  4315     xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
  4317     while (height--)
  4319 	dst = dst_line;
  4320 	dst_line += dst_stride;
  4321 	mask = mask_line;
  4322 	mask_line += mask_stride;
  4323 	w = width;
  4325 	while (w && ((uintptr_t)dst & 15))
  4327 	    m = (uint32_t) *mask++;
  4328 	    d = (uint32_t) *dst;
  4330 	    *dst++ = (uint8_t) pack_1x128_32 (
  4331 		_mm_adds_epu16 (
  4332 		    pix_multiply_1x128 (
  4333 			xmm_alpha, unpack_32_1x128 (m)),
  4334 		    unpack_32_1x128 (d)));
  4335 	    w--;
  4338 	while (w >= 16)
  4340 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
  4341 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  4343 	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  4344 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  4346 	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
  4347 				&xmm_mask_lo, &xmm_mask_hi,
  4348 				&xmm_mask_lo, &xmm_mask_hi);
  4350 	    xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
  4351 	    xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
  4353 	    save_128_aligned (
  4354 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  4356 	    mask += 16;
  4357 	    dst += 16;
  4358 	    w -= 16;
  4361 	while (w)
  4363 	    m = (uint32_t) *mask++;
  4364 	    d = (uint32_t) *dst;
  4366 	    *dst++ = (uint8_t) pack_1x128_32 (
  4367 		_mm_adds_epu16 (
  4368 		    pix_multiply_1x128 (
  4369 			xmm_alpha, unpack_32_1x128 (m)),
  4370 		    unpack_32_1x128 (d)));
  4372 	    w--;
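/*
 * ADD operator: solid source, no mask, a8 destination
 * (dest = saturate (src.alpha + d)).  The alpha byte is replicated into
 * all 16 lanes of xmm_src so the vector loop can use _mm_adds_epu8 on
 * packed 8-bit data.
 */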
  4378 static void
  4379 sse2_composite_add_n_8 (pixman_implementation_t *imp,
  4380 			pixman_composite_info_t *info)
  4382     PIXMAN_COMPOSITE_ARGS (info);
  4383     uint8_t     *dst_line, *dst;
  4384     int dst_stride;
  4385     int32_t w;
  4386     uint32_t src;
  4388     __m128i xmm_src;
  4390     PIXMAN_IMAGE_GET_LINE (
  4391 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  4393     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  4395     src >>= 24;
  4397     if (src == 0x00)
  4398 	return;
  4400     if (src == 0xff)
  4402 	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
  4403 		     8, dest_x, dest_y, width, height, 0xff);
  4405 	return;
  4408     src = (src << 24) | (src << 16) | (src << 8) | src;
  4409     xmm_src = _mm_set_epi32 (src, src, src, src);
  4411     while (height--)
  4413 	dst = dst_line;
  4414 	dst_line += dst_stride;
  4415 	w = width;
  4417 	while (w && ((uintptr_t)dst & 15))
  4419 	    *dst = (uint8_t)_mm_cvtsi128_si32 (
  4420 		_mm_adds_epu8 (
  4421 		    xmm_src,
  4422 		    _mm_cvtsi32_si128 (*dst)));
  4424 	    w--;
  4425 	    dst++;
  4428 	while (w >= 16)
  4430 	    save_128_aligned (
   4431 		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
  4433 	    dst += 16;
  4434 	    w -= 16;
  4437 	while (w)
  4439 	    *dst = (uint8_t)_mm_cvtsi128_si32 (
  4440 		_mm_adds_epu8 (
  4441 		    xmm_src,
  4442 		    _mm_cvtsi32_si128 (*dst)));
  4444 	    w--;
  4445 	    dst++;
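/*
 * ADD operator: a8 source, a8 destination.  The head and tail use a
 * branch-free scalar saturating add (t | (0 - (t >> 8))); the aligned
 * middle hands four bytes at a time to sse2_combine_add_u.
 */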
  4451 static void
  4452 sse2_composite_add_8_8 (pixman_implementation_t *imp,
  4453 			pixman_composite_info_t *info)
  4455     PIXMAN_COMPOSITE_ARGS (info);
  4456     uint8_t     *dst_line, *dst;
  4457     uint8_t     *src_line, *src;
  4458     int dst_stride, src_stride;
  4459     int32_t w;
  4460     uint16_t t;
  4462     PIXMAN_IMAGE_GET_LINE (
  4463 	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
  4464     PIXMAN_IMAGE_GET_LINE (
  4465 	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  4467     while (height--)
  4469 	dst = dst_line;
  4470 	src = src_line;
  4472 	dst_line += dst_stride;
  4473 	src_line += src_stride;
  4474 	w = width;
  4476 	/* Small head */
  4477 	while (w && (uintptr_t)dst & 3)
  4479 	    t = (*dst) + (*src++);
  4480 	    *dst++ = t | (0 - (t >> 8));
  4481 	    w--;
  4484 	sse2_combine_add_u (imp, op,
  4485 			    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
  4487 	/* Small tail */
  4488 	dst += w & 0xfffc;
  4489 	src += w & 0xfffc;
  4491 	w &= 3;
  4493 	while (w)
  4495 	    t = (*dst) + (*src++);
  4496 	    *dst++ = t | (0 - (t >> 8));
  4497 	    w--;
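/*
 * ADD operator: a8r8g8b8 source onto an a8r8g8b8 destination; each
 * scanline is handed directly to sse2_combine_add_u.
 */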
  4503 static void
  4504 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
  4505                               pixman_composite_info_t *info)
  4507     PIXMAN_COMPOSITE_ARGS (info);
  4508     uint32_t    *dst_line, *dst;
  4509     uint32_t    *src_line, *src;
  4510     int dst_stride, src_stride;
  4512     PIXMAN_IMAGE_GET_LINE (
  4513 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  4514     PIXMAN_IMAGE_GET_LINE (
  4515 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  4517     while (height--)
  4519 	dst = dst_line;
  4520 	dst_line += dst_stride;
  4521 	src = src_line;
  4522 	src_line += src_stride;
  4524 	sse2_combine_add_u (imp, op, dst, src, NULL, width);
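/*
 * ADD operator: solid source onto an a8r8g8b8 destination
 * (dest = saturate (src + dest) per channel).  A zero source is a no-op
 * and src == ~0 degenerates to a fill with ~0.
 */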
  4528 static void
  4529 sse2_composite_add_n_8888 (pixman_implementation_t *imp,
  4530 			   pixman_composite_info_t *info)
  4532     PIXMAN_COMPOSITE_ARGS (info);
  4533     uint32_t *dst_line, *dst, src;
  4534     int dst_stride;
  4536     __m128i xmm_src;
  4538     PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  4540     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  4541     if (src == 0)
  4542 	return;
  4544     if (src == ~0)
  4546 	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
  4547 		     dest_x, dest_y, width, height, ~0);
  4549 	return;
  4552     xmm_src = _mm_set_epi32 (src, src, src, src);
  4553     while (height--)
  4555 	int w = width;
  4556 	uint32_t d;
  4558 	dst = dst_line;
  4559 	dst_line += dst_stride;
   4561 	while (w && (uintptr_t)dst & 15)
  4563 	    d = *dst;
  4564 	    *dst++ =
  4565 		_mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
  4566 	    w--;
  4569 	while (w >= 4)
  4571 	    save_128_aligned
  4572 		((__m128i*)dst,
  4573 		 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
  4575 	    dst += 4;
  4576 	    w -= 4;
  4579 	while (w--)
  4581 	    d = *dst;
  4582 	    *dst++ =
  4583 		_mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
  4584 						  _mm_cvtsi32_si128 (d)));
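/*
 * ADD operator: solid source, a8 mask, a8r8g8b8 destination
 * (dest = saturate (src * m + dest)); pixels whose mask byte is zero are
 * skipped.
 */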
  4589 static void
  4590 sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
  4591 			     pixman_composite_info_t *info)
  4593     PIXMAN_COMPOSITE_ARGS (info);
  4594     uint32_t     *dst_line, *dst;
  4595     uint8_t     *mask_line, *mask;
  4596     int dst_stride, mask_stride;
  4597     int32_t w;
  4598     uint32_t src;
  4600     __m128i xmm_src;
  4602     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  4603     if (src == 0)
  4604 	return;
  4605     xmm_src = expand_pixel_32_1x128 (src);
  4607     PIXMAN_IMAGE_GET_LINE (
  4608 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  4609     PIXMAN_IMAGE_GET_LINE (
  4610 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  4612     while (height--)
  4614 	dst = dst_line;
  4615 	dst_line += dst_stride;
  4616 	mask = mask_line;
  4617 	mask_line += mask_stride;
  4618 	w = width;
   4620 	while (w && ((uintptr_t)dst & 15))
  4622 	    uint8_t m = *mask++;
  4623 	    if (m)
  4625 		*dst = pack_1x128_32
  4626 		    (_mm_adds_epu16
  4627 		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
  4628 		      unpack_32_1x128 (*dst)));
  4630 	    dst++;
  4631 	    w--;
  4634 	while (w >= 4)
  4636 	    uint32_t m = *(uint32_t*)mask;
  4637 	    if (m)
  4639 		__m128i xmm_mask_lo, xmm_mask_hi;
  4640 		__m128i xmm_dst_lo, xmm_dst_hi;
  4642 		__m128i xmm_dst = load_128_aligned ((__m128i*)dst);
  4643 		__m128i xmm_mask =
  4644 		    _mm_unpacklo_epi8 (unpack_32_1x128(m),
  4645 				       _mm_setzero_si128 ());
  4647 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  4648 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  4650 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
  4651 					&xmm_mask_lo, &xmm_mask_hi);
  4653 		pix_multiply_2x128 (&xmm_src, &xmm_src,
  4654 				    &xmm_mask_lo, &xmm_mask_hi,
  4655 				    &xmm_mask_lo, &xmm_mask_hi);
  4657 		xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
  4658 		xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
  4660 		save_128_aligned (
  4661 		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  4664 	    w -= 4;
  4665 	    dst += 4;
  4666 	    mask += 4;
  4669 	while (w)
  4671 	    uint8_t m = *mask++;
  4672 	    if (m)
  4674 		*dst = pack_1x128_32
  4675 		    (_mm_adds_epu16
  4676 		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
  4677 		      unpack_32_1x128 (*dst)));
  4679 	    dst++;
  4680 	    w--;
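/*
 * Plain blit between two images of equal depth (16 or 32 bpp only).
 * After aligning the destination, the main loop copies 64 bytes per
 * iteration using unaligned loads and aligned stores.
 */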
  4685 static pixman_bool_t
  4686 sse2_blt (pixman_implementation_t *imp,
  4687           uint32_t *               src_bits,
  4688           uint32_t *               dst_bits,
  4689           int                      src_stride,
  4690           int                      dst_stride,
  4691           int                      src_bpp,
  4692           int                      dst_bpp,
  4693           int                      src_x,
  4694           int                      src_y,
  4695           int                      dest_x,
  4696           int                      dest_y,
  4697           int                      width,
  4698           int                      height)
  4700     uint8_t *   src_bytes;
  4701     uint8_t *   dst_bytes;
  4702     int byte_width;
  4704     if (src_bpp != dst_bpp)
  4705 	return FALSE;
  4707     if (src_bpp == 16)
  4709 	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
  4710 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
   4711 	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
  4712 	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
  4713 	byte_width = 2 * width;
  4714 	src_stride *= 2;
  4715 	dst_stride *= 2;
  4717     else if (src_bpp == 32)
  4719 	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
  4720 	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
  4721 	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
  4722 	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
  4723 	byte_width = 4 * width;
  4724 	src_stride *= 4;
  4725 	dst_stride *= 4;
  4727     else
  4729 	return FALSE;
  4732     while (height--)
  4734 	int w;
  4735 	uint8_t *s = src_bytes;
  4736 	uint8_t *d = dst_bytes;
  4737 	src_bytes += src_stride;
  4738 	dst_bytes += dst_stride;
  4739 	w = byte_width;
  4741 	while (w >= 2 && ((uintptr_t)d & 3))
  4743 	    *(uint16_t *)d = *(uint16_t *)s;
  4744 	    w -= 2;
  4745 	    s += 2;
  4746 	    d += 2;
  4749 	while (w >= 4 && ((uintptr_t)d & 15))
  4751 	    *(uint32_t *)d = *(uint32_t *)s;
  4753 	    w -= 4;
  4754 	    s += 4;
  4755 	    d += 4;
  4758 	while (w >= 64)
  4760 	    __m128i xmm0, xmm1, xmm2, xmm3;
  4762 	    xmm0 = load_128_unaligned ((__m128i*)(s));
  4763 	    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
  4764 	    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
  4765 	    xmm3 = load_128_unaligned ((__m128i*)(s + 48));
  4767 	    save_128_aligned ((__m128i*)(d),    xmm0);
  4768 	    save_128_aligned ((__m128i*)(d + 16), xmm1);
  4769 	    save_128_aligned ((__m128i*)(d + 32), xmm2);
  4770 	    save_128_aligned ((__m128i*)(d + 48), xmm3);
  4772 	    s += 64;
  4773 	    d += 64;
  4774 	    w -= 64;
  4777 	while (w >= 16)
   4779 	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s));
  4781 	    w -= 16;
  4782 	    d += 16;
  4783 	    s += 16;
  4786 	while (w >= 4)
  4788 	    *(uint32_t *)d = *(uint32_t *)s;
  4790 	    w -= 4;
  4791 	    s += 4;
  4792 	    d += 4;
  4795 	if (w >= 2)
  4797 	    *(uint16_t *)d = *(uint16_t *)s;
  4798 	    w -= 2;
  4799 	    s += 2;
  4800 	    d += 2;
  4804     return TRUE;
  4807 static void
  4808 sse2_composite_copy_area (pixman_implementation_t *imp,
  4809                           pixman_composite_info_t *info)
  4811     PIXMAN_COMPOSITE_ARGS (info);
  4812     sse2_blt (imp, src_image->bits.bits,
  4813 	      dest_image->bits.bits,
  4814 	      src_image->bits.rowstride,
  4815 	      dest_image->bits.rowstride,
  4816 	      PIXMAN_FORMAT_BPP (src_image->bits.format),
  4817 	      PIXMAN_FORMAT_BPP (dest_image->bits.format),
  4818 	      src_x, src_y, dest_x, dest_y, width, height);
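/*
 * OVER operator: x8r8g8b8 source, a8 mask, 8888 destination.  The source
 * is forced opaque (0xff000000 | s), so the blend reduces to in_over with
 * a constant 0xff source alpha (mask_00ff).
 */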
  4821 static void
  4822 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
  4823                                  pixman_composite_info_t *info)
  4825     PIXMAN_COMPOSITE_ARGS (info);
  4826     uint32_t    *src, *src_line, s;
  4827     uint32_t    *dst, *dst_line, d;
  4828     uint8_t         *mask, *mask_line;
  4829     uint32_t m;
  4830     int src_stride, mask_stride, dst_stride;
  4831     int32_t w;
  4832     __m128i ms;
  4834     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  4835     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  4836     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  4838     PIXMAN_IMAGE_GET_LINE (
  4839 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  4840     PIXMAN_IMAGE_GET_LINE (
  4841 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  4842     PIXMAN_IMAGE_GET_LINE (
  4843 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  4845     while (height--)
  4847         src = src_line;
  4848         src_line += src_stride;
  4849         dst = dst_line;
  4850         dst_line += dst_stride;
  4851         mask = mask_line;
  4852         mask_line += mask_stride;
  4854         w = width;
  4856         while (w && (uintptr_t)dst & 15)
  4858             s = 0xff000000 | *src++;
  4859             m = (uint32_t) *mask++;
  4860             d = *dst;
  4861             ms = unpack_32_1x128 (s);
  4863             if (m != 0xff)
  4865 		__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
  4866 		__m128i md = unpack_32_1x128 (d);
  4868                 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
  4871             *dst++ = pack_1x128_32 (ms);
  4872             w--;
  4875         while (w >= 4)
  4877             m = *(uint32_t*) mask;
  4878             xmm_src = _mm_or_si128 (
  4879 		load_128_unaligned ((__m128i*)src), mask_ff000000);
  4881             if (m == 0xffffffff)
  4883                 save_128_aligned ((__m128i*)dst, xmm_src);
  4885             else
  4887                 xmm_dst = load_128_aligned ((__m128i*)dst);
  4889                 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
  4891                 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  4892                 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  4893                 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  4895                 expand_alpha_rev_2x128 (
  4896 		    xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  4898                 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
  4899 			       &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
  4900 			       &xmm_dst_lo, &xmm_dst_hi);
  4902                 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  4905             src += 4;
  4906             dst += 4;
  4907             mask += 4;
  4908             w -= 4;
  4911         while (w)
  4913             m = (uint32_t) *mask++;
  4915             if (m)
  4917                 s = 0xff000000 | *src;
  4919                 if (m == 0xff)
  4921                     *dst = s;
  4923                 else
  4925 		    __m128i ma, md, ms;
  4927                     d = *dst;
  4929 		    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
  4930 		    md = unpack_32_1x128 (d);
  4931 		    ms = unpack_32_1x128 (s);
  4933                     *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
  4938             src++;
  4939             dst++;
  4940             w--;
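/*
 * OVER operator: a8r8g8b8 source, a8 mask, 8888 destination.  Opaque
 * source pixels under a 0xff mask are stored directly; everything else
 * goes through in_over with the source alpha and the expanded mask.
 */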
  4946 static void
  4947 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
  4948                                  pixman_composite_info_t *info)
  4950     PIXMAN_COMPOSITE_ARGS (info);
  4951     uint32_t    *src, *src_line, s;
  4952     uint32_t    *dst, *dst_line, d;
  4953     uint8_t         *mask, *mask_line;
  4954     uint32_t m;
  4955     int src_stride, mask_stride, dst_stride;
  4956     int32_t w;
  4958     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
  4959     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  4960     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  4962     PIXMAN_IMAGE_GET_LINE (
  4963 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  4964     PIXMAN_IMAGE_GET_LINE (
  4965 	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  4966     PIXMAN_IMAGE_GET_LINE (
  4967 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  4969     while (height--)
  4971         src = src_line;
  4972         src_line += src_stride;
  4973         dst = dst_line;
  4974         dst_line += dst_stride;
  4975         mask = mask_line;
  4976         mask_line += mask_stride;
  4978         w = width;
  4980         while (w && (uintptr_t)dst & 15)
  4982 	    uint32_t sa;
  4984             s = *src++;
  4985             m = (uint32_t) *mask++;
  4986             d = *dst;
  4988 	    sa = s >> 24;
  4990 	    if (m)
  4992 		if (sa == 0xff && m == 0xff)
  4994 		    *dst = s;
  4996 		else
  4998 		    __m128i ms, md, ma, msa;
  5000 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
  5001 		    ms = unpack_32_1x128 (s);
  5002 		    md = unpack_32_1x128 (d);
  5004 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
  5006 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
  5010 	    dst++;
  5011             w--;
  5014         while (w >= 4)
  5016             m = *(uint32_t *) mask;
  5018 	    if (m)
  5020 		xmm_src = load_128_unaligned ((__m128i*)src);
  5022 		if (m == 0xffffffff && is_opaque (xmm_src))
  5024 		    save_128_aligned ((__m128i *)dst, xmm_src);
  5026 		else
  5028 		    xmm_dst = load_128_aligned ((__m128i *)dst);
  5030 		    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
  5032 		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  5033 		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  5034 		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  5036 		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
  5037 		    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  5039 		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
  5040 				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
  5042 		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  5046             src += 4;
  5047             dst += 4;
  5048             mask += 4;
  5049             w -= 4;
  5052         while (w)
  5054 	    uint32_t sa;
  5056             s = *src++;
  5057             m = (uint32_t) *mask++;
  5058             d = *dst;
  5060 	    sa = s >> 24;
  5062 	    if (m)
  5064 		if (sa == 0xff && m == 0xff)
  5066 		    *dst = s;
  5068 		else
  5070 		    __m128i ms, md, ma, msa;
  5072 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
  5073 		    ms = unpack_32_1x128 (s);
  5074 		    md = unpack_32_1x128 (d);
  5076 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
  5078 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
  5082 	    dst++;
  5083             w--;
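/*
 * OVER_REVERSE operator with a solid source: the destination is
 * composited over the constant source, dest = dest + (1 - dest.alpha) * src.
 */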
  5089 static void
  5090 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
  5091 				    pixman_composite_info_t *info)
  5093     PIXMAN_COMPOSITE_ARGS (info);
  5094     uint32_t src;
  5095     uint32_t    *dst_line, *dst;
  5096     __m128i xmm_src;
  5097     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  5098     __m128i xmm_dsta_hi, xmm_dsta_lo;
  5099     int dst_stride;
  5100     int32_t w;
  5102     src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  5104     if (src == 0)
  5105 	return;
  5107     PIXMAN_IMAGE_GET_LINE (
  5108 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  5110     xmm_src = expand_pixel_32_1x128 (src);
  5112     while (height--)
  5114 	dst = dst_line;
  5116 	dst_line += dst_stride;
  5117 	w = width;
  5119 	while (w && (uintptr_t)dst & 15)
  5121 	    __m128i vd;
  5123 	    vd = unpack_32_1x128 (*dst);
  5125 	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
  5126 					      xmm_src));
  5127 	    w--;
  5128 	    dst++;
  5131 	while (w >= 4)
  5133 	    __m128i tmp_lo, tmp_hi;
  5135 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  5137 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  5138 	    expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
  5140 	    tmp_lo = xmm_src;
  5141 	    tmp_hi = xmm_src;
  5143 	    over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
  5144 			&xmm_dsta_lo, &xmm_dsta_hi,
  5145 			&tmp_lo, &tmp_hi);
  5147 	    save_128_aligned (
  5148 		(__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
  5150 	    w -= 4;
  5151 	    dst += 4;
  5154 	while (w)
  5156 	    __m128i vd;
  5158 	    vd = unpack_32_1x128 (*dst);
  5160 	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
  5161 					      xmm_src));
  5162 	    w--;
  5163 	    dst++;
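/*
 * OVER operator: a8r8g8b8 source, a8r8g8b8 mask, 8888 destination.  Only
 * the alpha channel of the mask is used (m >> 24 in the scalar paths,
 * expand_alpha_2x128 in the vector path).
 */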
  5170 static void
  5171 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
  5172 				    pixman_composite_info_t *info)
  5174     PIXMAN_COMPOSITE_ARGS (info);
  5175     uint32_t    *src, *src_line, s;
  5176     uint32_t    *dst, *dst_line, d;
  5177     uint32_t    *mask, *mask_line;
  5178     uint32_t    m;
  5179     int src_stride, mask_stride, dst_stride;
  5180     int32_t w;
  5182     __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
  5183     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  5184     __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  5186     PIXMAN_IMAGE_GET_LINE (
  5187 	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  5188     PIXMAN_IMAGE_GET_LINE (
  5189 	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
  5190     PIXMAN_IMAGE_GET_LINE (
  5191 	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  5193     while (height--)
  5195         src = src_line;
  5196         src_line += src_stride;
  5197         dst = dst_line;
  5198         dst_line += dst_stride;
  5199         mask = mask_line;
  5200         mask_line += mask_stride;
  5202         w = width;
  5204         while (w && (uintptr_t)dst & 15)
  5206 	    uint32_t sa;
  5208             s = *src++;
  5209             m = (*mask++) >> 24;
  5210             d = *dst;
  5212 	    sa = s >> 24;
  5214 	    if (m)
  5216 		if (sa == 0xff && m == 0xff)
  5218 		    *dst = s;
  5220 		else
  5222 		    __m128i ms, md, ma, msa;
  5224 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
  5225 		    ms = unpack_32_1x128 (s);
  5226 		    md = unpack_32_1x128 (d);
  5228 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
  5230 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
  5234 	    dst++;
  5235             w--;
  5238         while (w >= 4)
  5240 	    xmm_mask = load_128_unaligned ((__m128i*)mask);
  5242 	    if (!is_transparent (xmm_mask))
  5244 		xmm_src = load_128_unaligned ((__m128i*)src);
  5246 		if (is_opaque (xmm_mask) && is_opaque (xmm_src))
  5248 		    save_128_aligned ((__m128i *)dst, xmm_src);
  5250 		else
  5252 		    xmm_dst = load_128_aligned ((__m128i *)dst);
  5254 		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  5255 		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  5256 		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  5258 		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
  5259 		    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  5261 		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
  5262 				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
  5264 		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  5268             src += 4;
  5269             dst += 4;
  5270             mask += 4;
  5271             w -= 4;
  5274         while (w)
  5276 	    uint32_t sa;
  5278             s = *src++;
  5279             m = (*mask++) >> 24;
  5280             d = *dst;
  5282 	    sa = s >> 24;
  5284 	    if (m)
  5286 		if (sa == 0xff && m == 0xff)
  5288 		    *dst = s;
  5290 		else
  5292 		    __m128i ms, md, ma, msa;
  5294 		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
  5295 		    ms = unpack_32_1x128 (s);
  5296 		    md = unpack_32_1x128 (d);
  5298 		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
  5300 		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
  5304 	    dst++;
  5305             w--;
  5311 /* A variant of 'sse2_combine_over_u' with minor tweaks */
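/*
 * One scanline of nearest-neighbour scaling: vx advances by unit_x per
 * destination pixel and is wrapped back by src_width_fixed whenever it
 * becomes non-negative (apparently how the NORMAL repeat variant keeps
 * the source coordinate in range); four fetched source pixels at a time
 * are packed into a vector for the OVER blend.
 */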
  5312 static force_inline void
  5313 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
  5314                                              const uint32_t* ps,
  5315                                              int32_t         w,
  5316                                              pixman_fixed_t  vx,
  5317                                              pixman_fixed_t  unit_x,
  5318                                              pixman_fixed_t  src_width_fixed,
  5319                                              pixman_bool_t   fully_transparent_src)
  5321     uint32_t s, d;
  5322     const uint32_t* pm = NULL;
  5324     __m128i xmm_dst_lo, xmm_dst_hi;
  5325     __m128i xmm_src_lo, xmm_src_hi;
  5326     __m128i xmm_alpha_lo, xmm_alpha_hi;
  5328     if (fully_transparent_src)
  5329 	return;
  5331     /* Align dst on a 16-byte boundary */
  5332     while (w && ((uintptr_t)pd & 15))
  5334 	d = *pd;
  5335 	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
  5336 	vx += unit_x;
  5337 	while (vx >= 0)
  5338 	    vx -= src_width_fixed;
  5340 	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
  5341 	if (pm)
  5342 	    pm++;
  5343 	w--;
  5346     while (w >= 4)
  5348 	__m128i tmp;
  5349 	uint32_t tmp1, tmp2, tmp3, tmp4;
  5351 	tmp1 = *(ps + pixman_fixed_to_int (vx));
  5352 	vx += unit_x;
  5353 	while (vx >= 0)
  5354 	    vx -= src_width_fixed;
  5355 	tmp2 = *(ps + pixman_fixed_to_int (vx));
  5356 	vx += unit_x;
  5357 	while (vx >= 0)
  5358 	    vx -= src_width_fixed;
  5359 	tmp3 = *(ps + pixman_fixed_to_int (vx));
  5360 	vx += unit_x;
  5361 	while (vx >= 0)
  5362 	    vx -= src_width_fixed;
  5363 	tmp4 = *(ps + pixman_fixed_to_int (vx));
  5364 	vx += unit_x;
  5365 	while (vx >= 0)
  5366 	    vx -= src_width_fixed;
  5368 	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
  5370 	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
  5372 	if (is_opaque (xmm_src_hi))
  5374 	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
  5376 	else if (!is_zero (xmm_src_hi))
  5378 	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);
  5380 	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  5381 	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  5383 	    expand_alpha_2x128 (
  5384 		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
  5386 	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
  5387 			&xmm_alpha_lo, &xmm_alpha_hi,
  5388 			&xmm_dst_lo, &xmm_dst_hi);
   5390 	    /* rebuild the 4 pixel data and save */
  5391 	    save_128_aligned ((__m128i*)pd,
  5392 			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  5395 	w -= 4;
  5396 	pd += 4;
  5397 	if (pm)
  5398 	    pm += 4;
  5401     while (w)
  5403 	d = *pd;
  5404 	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
  5405 	vx += unit_x;
  5406 	while (vx >= 0)
  5407 	    vx -= src_width_fixed;
  5409 	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
  5410 	if (pm)
  5411 	    pm++;
  5413 	w--;
  5417 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
  5418 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
  5419 		       uint32_t, uint32_t, COVER)
  5420 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
  5421 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
  5422 		       uint32_t, uint32_t, NONE)
  5423 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
  5424 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
  5425 		       uint32_t, uint32_t, PAD)
  5426 FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
  5427 		       scaled_nearest_scanline_sse2_8888_8888_OVER,
  5428 		       uint32_t, uint32_t, NORMAL)
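/*
 * Nearest-neighbour scaling with a solid mask: the constant mask alpha
 * (*mask >> 24) is expanded once via create_mask_16_128 and reused for
 * every pixel of the scanline.
 */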
  5430 static force_inline void
  5431 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
  5432 					       uint32_t *       dst,
  5433 					       const uint32_t * src,
  5434 					       int32_t          w,
  5435 					       pixman_fixed_t   vx,
  5436 					       pixman_fixed_t   unit_x,
  5437 					       pixman_fixed_t   src_width_fixed,
  5438 					       pixman_bool_t    zero_src)
  5440     __m128i xmm_mask;
  5441     __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  5442     __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  5443     __m128i xmm_alpha_lo, xmm_alpha_hi;
  5445     if (zero_src || (*mask >> 24) == 0)
  5446 	return;
  5448     xmm_mask = create_mask_16_128 (*mask >> 24);
  5450     while (w && (uintptr_t)dst & 15)
  5452 	uint32_t s = *(src + pixman_fixed_to_int (vx));
  5453 	vx += unit_x;
  5454 	while (vx >= 0)
  5455 	    vx -= src_width_fixed;
  5457 	if (s)
  5459 	    uint32_t d = *dst;
  5461 	    __m128i ms = unpack_32_1x128 (s);
  5462 	    __m128i alpha     = expand_alpha_1x128 (ms);
  5463 	    __m128i dest      = xmm_mask;
  5464 	    __m128i alpha_dst = unpack_32_1x128 (d);
  5466 	    *dst = pack_1x128_32 (
  5467 		in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
  5469 	dst++;
  5470 	w--;
  5473     while (w >= 4)
  5475 	uint32_t tmp1, tmp2, tmp3, tmp4;
  5477 	tmp1 = *(src + pixman_fixed_to_int (vx));
  5478 	vx += unit_x;
  5479 	while (vx >= 0)
  5480 	    vx -= src_width_fixed;
  5481 	tmp2 = *(src + pixman_fixed_to_int (vx));
  5482 	vx += unit_x;
  5483 	while (vx >= 0)
  5484 	    vx -= src_width_fixed;
  5485 	tmp3 = *(src + pixman_fixed_to_int (vx));
  5486 	vx += unit_x;
  5487 	while (vx >= 0)
  5488 	    vx -= src_width_fixed;
  5489 	tmp4 = *(src + pixman_fixed_to_int (vx));
  5490 	vx += unit_x;
  5491 	while (vx >= 0)
  5492 	    vx -= src_width_fixed;
  5494 	xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
  5496 	if (!is_zero (xmm_src))
  5498 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  5500 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  5501 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  5502 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  5503 			        &xmm_alpha_lo, &xmm_alpha_hi);
  5505 	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
  5506 			   &xmm_alpha_lo, &xmm_alpha_hi,
  5507 			   &xmm_mask, &xmm_mask,
  5508 			   &xmm_dst_lo, &xmm_dst_hi);
  5510 	    save_128_aligned (
  5511 		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  5514 	dst += 4;
  5515 	w -= 4;
  5518     while (w)
  5520 	uint32_t s = *(src + pixman_fixed_to_int (vx));
  5521 	vx += unit_x;
  5522 	while (vx >= 0)
  5523 	    vx -= src_width_fixed;
  5525 	if (s)
  5527 	    uint32_t d = *dst;
  5529 	    __m128i ms = unpack_32_1x128 (s);
  5530 	    __m128i alpha = expand_alpha_1x128 (ms);
  5531 	    __m128i mask  = xmm_mask;
  5532 	    __m128i dest  = unpack_32_1x128 (d);
  5534 	    *dst = pack_1x128_32 (
  5535 		in_over_1x128 (&ms, &alpha, &mask, &dest));
  5538 	dst++;
  5539 	w--;
  5544 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
  5545 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
  5546 			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
  5547 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
  5548 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
  5549 			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
  5550 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
  5551 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
  5552 			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
  5553 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
  5554 			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
  5555 			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
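/*
 * Bilinear interpolation helpers: a 2x2 source block (top pair / bottom
 * pair) is interpolated vertically with the weights wt/wb and then
 * horizontally with weights derived from the fractional bits of vx;
 * BMSK masks the fractional part of the coordinate used for the
 * horizontal weights.
 */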
  5557 #define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
  5559 #define BILINEAR_DECLARE_VARIABLES						\
  5560     const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
  5561     const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
  5562     const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\
  5563     const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
  5564     const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\
  5565     const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
  5566     const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,	\
  5567 					  unit_x, unit_x, unit_x, unit_x);	\
  5568     const __m128i xmm_zero = _mm_setzero_si128 ();				\
  5569     __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
  5571 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
  5572 do {										\
  5573     __m128i xmm_wh, xmm_lo, xmm_hi, a;						\
  5574     /* fetch 2x2 pixel block into sse2 registers */				\
  5575     __m128i tltr = _mm_loadl_epi64 (						\
  5576 			    (__m128i *)&src_top[pixman_fixed_to_int (vx)]);	\
  5577     __m128i blbr = _mm_loadl_epi64 (						\
  5578 			    (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]);	\
  5579     vx += unit_x;								\
  5580     /* vertical interpolation */						\
  5581     a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero),	\
  5582 					xmm_wt),				\
  5583 		       _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero),	\
  5584 					xmm_wb));				\
  5585     if (BILINEAR_INTERPOLATION_BITS < 8)					\
  5586     {										\
  5587 	/* calculate horizontal weights */					\
  5588 	xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7,		\
  5589 		   _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));	\
  5590 	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
  5591 	/* horizontal interpolation */						\
  5592 	a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
  5593 		a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh);			\
  5594     }										\
  5595     else									\
  5596     {										\
  5597 	/* calculate horizontal weights */					\
  5598 	xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8,		\
  5599 		_mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));	\
  5600 	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
  5601 	/* horizontal interpolation */						\
  5602 	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
  5603 	xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\
  5604 	a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\
  5605 			   _mm_unpackhi_epi16 (xmm_lo, xmm_hi));		\
  5606     }										\
  5607     /* shift and pack the result */						\
  5608     a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2);			\
  5609     a = _mm_packs_epi32 (a, a);							\
  5610     a = _mm_packus_epi16 (a, a);						\
  5611     pix = _mm_cvtsi128_si32 (a);						\
  5612 } while (0)
  5614 #define BILINEAR_SKIP_ONE_PIXEL()						\
  5615 do {										\
  5616     vx += unit_x;								\
  5617     xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
  5618 } while(0)
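/*
 * SRC operator with bilinear scaling: every destination pixel is simply
 * the interpolated source pixel, emitted four at a time where possible.
 */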
  5620 static force_inline void
  5621 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
  5622 					     const uint32_t * mask,
  5623 					     const uint32_t * src_top,
  5624 					     const uint32_t * src_bottom,
  5625 					     int32_t          w,
  5626 					     int              wt,
  5627 					     int              wb,
  5628 					     pixman_fixed_t   vx,
  5629 					     pixman_fixed_t   unit_x,
  5630 					     pixman_fixed_t   max_vx,
  5631 					     pixman_bool_t    zero_src)
  5633     BILINEAR_DECLARE_VARIABLES;
  5634     uint32_t pix1, pix2, pix3, pix4;
  5636     while ((w -= 4) >= 0)
  5638 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  5639 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
  5640 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
  5641 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
  5642 	*dst++ = pix1;
  5643 	*dst++ = pix2;
  5644 	*dst++ = pix3;
  5645 	*dst++ = pix4;
  5648     if (w & 2)
  5650 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  5651 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
  5652 	*dst++ = pix1;
  5653 	*dst++ = pix2;
  5656     if (w & 1)
  5658 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  5659 	*dst = pix1;
  5664 /* Add extra NULL argument to the existing bilinear fast paths to indicate
  5665  * that we don't need two-pass processing */
  5667 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
  5668 			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
  5669 			       uint32_t, uint32_t, uint32_t,
  5670 			       COVER, FLAG_NONE)
  5671 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
  5672 			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
  5673 			       uint32_t, uint32_t, uint32_t,
  5674 			       PAD, FLAG_NONE)
  5675 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
  5676 			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
  5677 			       uint32_t, uint32_t, uint32_t,
  5678 			       NONE, FLAG_NONE)
  5679 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
  5680 			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
  5681 			       uint32_t, uint32_t, uint32_t,
  5682 			       NORMAL, FLAG_NONE)
  5684 static force_inline void
  5685 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
  5686 					      const uint32_t * mask,
  5687 					      const uint32_t * src_top,
  5688 					      const uint32_t * src_bottom,
  5689 					      int32_t          w,
  5690 					      int              wt,
  5691 					      int              wb,
  5692 					      pixman_fixed_t   vx,
  5693 					      pixman_fixed_t   unit_x,
  5694 					      pixman_fixed_t   max_vx,
  5695 					      pixman_bool_t    zero_src)
  5697     BILINEAR_DECLARE_VARIABLES;
  5698     uint32_t pix1, pix2, pix3, pix4;
  5700     while (w && ((uintptr_t)dst & 15))
  5702 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  5704 	if (pix1)
  5706 	    pix2 = *dst;
  5707 	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
  5710 	w--;
  5711 	dst++;
   5714 	while (w >= 4)
  5716 	__m128i xmm_src;
  5717 	__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
  5718 	__m128i xmm_alpha_hi, xmm_alpha_lo;
  5720 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  5721 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
  5722 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
  5723 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
  5725 	xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
  5727 	if (!is_zero (xmm_src))
  5729 	    if (is_opaque (xmm_src))
  5731 		save_128_aligned ((__m128i *)dst, xmm_src);
  5733 	    else
  5735 		__m128i xmm_dst = load_128_aligned ((__m128i *)dst);
  5737 		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  5738 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  5740 		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
  5741 		over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
  5742 			    &xmm_dst_lo, &xmm_dst_hi);
  5744 		save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  5748 	w -= 4;
  5749 	dst += 4;
  5752     while (w)
  5754 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  5756 	if (pix1)
  5758 	    pix2 = *dst;
  5759 	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
  5762 	w--;
  5763 	dst++;
  5767 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
  5768 			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
  5769 			       uint32_t, uint32_t, uint32_t,
  5770 			       COVER, FLAG_NONE)
  5771 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
  5772 			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
  5773 			       uint32_t, uint32_t, uint32_t,
  5774 			       PAD, FLAG_NONE)
  5775 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
  5776 			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
  5777 			       uint32_t, uint32_t, uint32_t,
  5778 			       NONE, FLAG_NONE)
  5779 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
  5780 			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
  5781 			       uint32_t, uint32_t, uint32_t,
  5782 			       NORMAL, FLAG_NONE)
   5785 /* An example of an SSE2 two-stage bilinear_over_8888_0565 fast path, implemented
   5786    as scaled_bilinear_scanline_sse2_8888_8888_SRC + op_bilinear_over_8888_0565 */
  5788 void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width)
   5790     /* Note: this is not really fast; it should be based on the 8-pixel loop from sse2_composite_over_8888_0565 */
  5791     while (--width >= 0)
  5793 	*dst = composite_over_8888_0565pixel (*src, *dst);
  5794 	src++;
  5795 	dst++;
  5799 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_cover_OVER,
  5800 			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
  5801 			       uint32_t, uint32_t, uint16_t,
  5802 			       COVER, FLAG_NONE)
  5803 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_pad_OVER,
  5804 			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
  5805 			       uint32_t, uint32_t, uint16_t,
  5806 			       PAD, FLAG_NONE)
  5807 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_none_OVER,
  5808 			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
  5809 			       uint32_t, uint32_t, uint16_t,
  5810 			       NONE, FLAG_NONE)
  5811 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_normal_OVER,
  5812 			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
  5813 			       uint32_t, uint32_t, uint16_t,
  5814 			       NORMAL, FLAG_NONE)
  5816 /*****************************/
  5818 static force_inline void
  5819 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
  5820 						const uint8_t  * mask,
  5821 						const uint32_t * src_top,
  5822 						const uint32_t * src_bottom,
  5823 						int32_t          w,
  5824 						int              wt,
  5825 						int              wb,
  5826 						pixman_fixed_t   vx,
  5827 						pixman_fixed_t   unit_x,
  5828 						pixman_fixed_t   max_vx,
  5829 						pixman_bool_t    zero_src)
  5831     BILINEAR_DECLARE_VARIABLES;
  5832     uint32_t pix1, pix2, pix3, pix4;
  5833     uint32_t m;
  5835     while (w && ((uintptr_t)dst & 15))
  5837 	uint32_t sa;
  5839 	m = (uint32_t) *mask++;
  5841 	if (m)
  5843 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  5844 	    sa = pix1 >> 24;
  5846 	    if (sa == 0xff && m == 0xff)
  5848 		*dst = pix1;
  5850 	    else
  5852 		__m128i ms, md, ma, msa;
  5854 		pix2 = *dst;
  5855 		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
  5856 		ms = unpack_32_1x128 (pix1);
  5857 		md = unpack_32_1x128 (pix2);
  5859 		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
  5861 		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
  5864 	else
  5866 	    BILINEAR_SKIP_ONE_PIXEL ();
  5869 	w--;
  5870 	dst++;
  5873     while (w >= 4)
  5875 	__m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
  5876 	__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  5877 	__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  5879 	m = *(uint32_t*)mask;
  5881 	if (m)
  5883 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  5884 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
  5885 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
  5886 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
  5888 	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
  5890 	    if (m == 0xffffffff && is_opaque (xmm_src))
  5892 		save_128_aligned ((__m128i *)dst, xmm_src);
  5894 	    else
  5896 		xmm_dst = load_128_aligned ((__m128i *)dst);
  5898 		xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
  5900 		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  5901 		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  5902 		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  5904 		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
  5905 		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  5907 		in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
  5908 			       &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
  5910 		save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  5913 	else
  5915 	    BILINEAR_SKIP_ONE_PIXEL ();
  5916 	    BILINEAR_SKIP_ONE_PIXEL ();
  5917 	    BILINEAR_SKIP_ONE_PIXEL ();
  5918 	    BILINEAR_SKIP_ONE_PIXEL ();
  5921 	w -= 4;
  5922 	dst += 4;
  5923 	mask += 4;
  5926     while (w)
  5928 	uint32_t sa;
  5930 	m = (uint32_t) *mask++;
  5932 	if (m)
  5934 	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  5935 	    sa = pix1 >> 24;
  5937 	    if (sa == 0xff && m == 0xff)
  5939 		*dst = pix1;
  5941 	    else
  5943 		__m128i ms, md, ma, msa;
  5945 		pix2 = *dst;
  5946 		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
  5947 		ms = unpack_32_1x128 (pix1);
  5948 		md = unpack_32_1x128 (pix2);
  5950 		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
  5952 		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
  5955 	else
  5957 	    BILINEAR_SKIP_ONE_PIXEL ();
  5960 	w--;
  5961 	dst++;
  5965 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
  5966 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
  5967 			       uint32_t, uint8_t, uint32_t,
  5968 			       COVER, FLAG_HAVE_NON_SOLID_MASK)
  5969 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
  5970 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
  5971 			       uint32_t, uint8_t, uint32_t,
  5972 			       PAD, FLAG_HAVE_NON_SOLID_MASK)
  5973 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
  5974 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
  5975 			       uint32_t, uint8_t, uint32_t,
  5976 			       NONE, FLAG_HAVE_NON_SOLID_MASK)
  5977 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
  5978 			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
  5979 			       uint32_t, uint8_t, uint32_t,
  5980 			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
  5982 static force_inline void
  5983 scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
  5984 						const uint32_t * mask,
  5985 						const uint32_t * src_top,
  5986 						const uint32_t * src_bottom,
  5987 						int32_t          w,
  5988 						int              wt,
  5989 						int              wb,
  5990 						pixman_fixed_t   vx,
  5991 						pixman_fixed_t   unit_x,
  5992 						pixman_fixed_t   max_vx,
  5993 						pixman_bool_t    zero_src)
  5995     BILINEAR_DECLARE_VARIABLES;
  5996     uint32_t pix1, pix2, pix3, pix4;
  5997     __m128i xmm_mask;
  5999     if (zero_src || (*mask >> 24) == 0)
  6000 	return;
  6002     xmm_mask = create_mask_16_128 (*mask >> 24);
  6004     while (w && ((uintptr_t)dst & 15))
  6006 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  6007 	if (pix1)
  6009 		uint32_t d = *dst;
  6011 		__m128i ms = unpack_32_1x128 (pix1);
  6012 		__m128i alpha     = expand_alpha_1x128 (ms);
  6013 		__m128i dest      = xmm_mask;
  6014 		__m128i alpha_dst = unpack_32_1x128 (d);
  6016 		*dst = pack_1x128_32
  6017 			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
  6020 	dst++;
  6021 	w--;
  6024     while (w >= 4)
  6026 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  6027 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
  6028 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
  6029 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
  6031 	if (pix1 | pix2 | pix3 | pix4)
  6033 	    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  6034 	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  6035 	    __m128i xmm_alpha_lo, xmm_alpha_hi;
  6037 	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
  6039 	    xmm_dst = load_128_aligned ((__m128i*)dst);
  6041 	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  6042 	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  6043 	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  6044 				&xmm_alpha_lo, &xmm_alpha_hi);
  6046 	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
  6047 			   &xmm_alpha_lo, &xmm_alpha_hi,
  6048 			   &xmm_mask, &xmm_mask,
  6049 			   &xmm_dst_lo, &xmm_dst_hi);
  6051 	    save_128_aligned
  6052 		((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  6055 	dst += 4;
  6056 	w -= 4;
  6059     while (w)
  6061 	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  6062 	if (pix1)
  6064 		uint32_t d = *dst;
  6066 		__m128i ms = unpack_32_1x128 (pix1);
  6067 		__m128i alpha     = expand_alpha_1x128 (ms);
  6068 		__m128i dest      = xmm_mask;
  6069 		__m128i alpha_dst = unpack_32_1x128 (d);
  6071 		*dst = pack_1x128_32
  6072 			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
  6075 	dst++;
  6076 	w--;
  6080 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
  6081 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
  6082 			       uint32_t, uint32_t, uint32_t,
  6083 			       COVER, FLAG_HAVE_SOLID_MASK)
  6084 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
  6085 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
  6086 			       uint32_t, uint32_t, uint32_t,
  6087 			       PAD, FLAG_HAVE_SOLID_MASK)
  6088 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
  6089 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
  6090 			       uint32_t, uint32_t, uint32_t,
  6091 			       NONE, FLAG_HAVE_SOLID_MASK)
  6092 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
  6093 			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
  6094 			       uint32_t, uint32_t, uint32_t,
  6095 			       NORMAL, FLAG_HAVE_SOLID_MASK)
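/*
 * Fast path table: maps (operator, source format, mask format,
 * destination format) combinations to the specialized composite
 * routines above.
 */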
  6097 static const pixman_fast_path_t sse2_fast_paths[] =
  6099     /* PIXMAN_OP_OVER */
  6100     PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
  6101     PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
  6102     PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
  6103     PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
  6104     PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
  6105     PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
  6106     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
  6107     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
  6108     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
  6109     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
  6110     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
  6111     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
  6112     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
  6113     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
  6114     PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
  6115     PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
  6116     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
  6117     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
  6118     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
  6119     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
  6120     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
  6121     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
  6122     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
  6123     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
  6124     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
  6125     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
  6126     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
  6127     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
  6128     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
  6129     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
  6130     PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
  6131     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
  6132     PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
  6133     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
  6134     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
  6135     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
  6136     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
  6137     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
  6138     PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
  6139     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
  6140     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
  6141     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
  6142     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
  6143     PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
  6144     PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
  6145     PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
  6146     PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
  6148     /* PIXMAN_OP_OVER_REVERSE */
  6149     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
  6150     PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
  6152     /* PIXMAN_OP_ADD */
  6153     PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
  6154     PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
  6155     PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
  6156     PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
  6157     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
  6158     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
  6159     PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
  6160     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
  6161     PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
  6162     PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
  6163     PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
  6164     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
  6165     PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
  6166     PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
  6168     /* PIXMAN_OP_SRC */
  6169     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
  6170     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
  6171     PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
  6172     PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
  6173     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
  6174     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
  6175     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
  6176     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
  6177     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
  6178     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
  6179     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
  6180     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
  6181     PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
  6182     PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
  6183     PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
  6184     PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
  6185     PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
  6186     PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
  6188     /* PIXMAN_OP_IN */
  6189     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
  6190     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
  6191     PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
  6193     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
  6194     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
  6195     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
  6196     SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  6197     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
  6198     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
  6199     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
  6200     SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  6201     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
  6202     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
  6203     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
  6204     SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  6205     SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
  6206     SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
  6207     SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
  6208     SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  6210     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
  6211     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
  6212     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
  6213     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
  6214     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
  6215     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
  6216     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
  6217     SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
  6219     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
  6220     SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
  6221     SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
  6222     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  6223     SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
  6224     SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
  6226     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
  6227     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
  6228     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
  6229     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  6231     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
  6232     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
  6233     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
  6234     SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
  6236     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
  6237     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
  6238     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
  6239     SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
  6241     /* and here the needed entries are added to the fast path table */
  6243     SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, sse2_8888_0565),
  6244     SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, b5g6r5, sse2_8888_0565),
  6246     { PIXMAN_OP_NONE },
  6247 };
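/* Scanline fetchers for the general compositing path: each converts one
 * scanline of its source format to 8888 in iter->buffer, handling single
 * pixels until the destination is 16-byte aligned and then several pixels
 * per iteration with SSE2. */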
  6249 static uint32_t *
  6250 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
  6251 {
  6252     int w = iter->width;
  6253     __m128i ff000000 = mask_ff000000;
  6254     uint32_t *dst = iter->buffer;
  6255     uint32_t *src = (uint32_t *)iter->bits;
  6257     iter->bits += iter->stride;
  6259     while (w && ((uintptr_t)dst) & 0x0f)
  6260     {
  6261 	*dst++ = (*src++) | 0xff000000;
  6262 	w--;
  6263     }
  6265     while (w >= 4)
  6266     {
  6267 	save_128_aligned (
  6268 	    (__m128i *)dst, _mm_or_si128 (
  6269 		load_128_unaligned ((__m128i *)src), ff000000));
  6271 	dst += 4;
  6272 	src += 4;
  6273 	w -= 4;
  6274     }
  6276     while (w)
  6277     {
  6278 	*dst++ = (*src++) | 0xff000000;
  6279 	w--;
  6280     }
  6282     return iter->buffer;
  6283 }
  6285 static uint32_t *
  6286 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
  6287 {
  6288     int w = iter->width;
  6289     uint32_t *dst = iter->buffer;
  6290     uint16_t *src = (uint16_t *)iter->bits;
  6291     __m128i ff000000 = mask_ff000000;
  6293     iter->bits += iter->stride;
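    /* Convert single r5g6b5 pixels until dst is 16-byte aligned, then expand
     * eight pixels at a time with unpack_565_to_8888 and force the alpha
     * byte to 0xff. */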
  6295     while (w && ((uintptr_t)dst) & 0x0f)
  6296     {
  6297 	uint16_t s = *src++;
  6299 	*dst++ = convert_0565_to_8888 (s);
  6300 	w--;
  6301     }
  6303     while (w >= 8)
  6304     {
  6305 	__m128i lo, hi, s;
  6307 	s = _mm_loadu_si128 ((__m128i *)src);
  6309 	lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
  6310 	hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
  6312 	save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
  6313 	save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
  6315 	dst += 8;
  6316 	src += 8;
  6317 	w -= 8;
  6318     }
  6320     while (w)
  6321     {
  6322 	uint16_t s = *src++;
  6324 	*dst++ = convert_0565_to_8888 (s);
  6325 	w--;
  6326     }
  6328     return iter->buffer;
  6329 }
  6331 static uint32_t *
  6332 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
  6333 {
  6334     int w = iter->width;
  6335     uint32_t *dst = iter->buffer;
  6336     uint8_t *src = iter->bits;
  6337     __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
  6339     iter->bits += iter->stride;
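    /* The unpack ladder below zero-extends 16 a8 bytes into 16 pixels:
     * interleaving zeros in as the low bytes twice moves each alpha value
     * into the top byte of its 32-bit lane (a << 24), matching the scalar
     * head and tail loops. */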
  6341     while (w && (((uintptr_t)dst) & 15))
  6342     {
  6343         *dst++ = *(src++) << 24;
  6344         w--;
  6345     }
  6347     while (w >= 16)
  6348     {
  6349 	xmm0 = _mm_loadu_si128((__m128i *)src);
  6351 	xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
  6352 	xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
  6353 	xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
  6354 	xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
  6355 	xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
  6356 	xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
  6358 	_mm_store_si128(((__m128i *)(dst +  0)), xmm3);
  6359 	_mm_store_si128(((__m128i *)(dst +  4)), xmm4);
  6360 	_mm_store_si128(((__m128i *)(dst +  8)), xmm5);
  6361 	_mm_store_si128(((__m128i *)(dst + 12)), xmm6);
  6363 	dst += 16;
  6364 	src += 16;
  6365 	w -= 16;
  6366     }
  6368     while (w)
  6369     {
  6370 	*dst++ = *(src++) << 24;
  6371 	w--;
  6372     }
  6374     return iter->buffer;
  6375 }
  6377 typedef struct
  6378 {
  6379     pixman_format_code_t	format;
  6380     pixman_iter_get_scanline_t	get_scanline;
  6381 } fetcher_info_t;
  6383 static const fetcher_info_t fetchers[] =
  6384 {
  6385     { PIXMAN_x8r8g8b8,		sse2_fetch_x8r8g8b8 },
  6386     { PIXMAN_r5g6b5,		sse2_fetch_r5g6b5 },
  6387     { PIXMAN_a8,		sse2_fetch_a8 },
  6388     { PIXMAN_null }
  6389 };
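/* SSE2 fetchers are only installed for narrow (8 bpc) iterators over
 * untransformed bits images whose samples fully cover the clip; anything
 * else falls through to the delegate implementation. */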
  6391 static pixman_bool_t
  6392 sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
  6393 {
  6394     pixman_image_t *image = iter->image;
  6396 #define FLAGS								\
  6397     (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
  6398      FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
  6400     if ((iter->iter_flags & ITER_NARROW)			&&
  6401 	(iter->image_flags & FLAGS) == FLAGS)
  6402     {
  6403 	const fetcher_info_t *f;
  6405 	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
  6406 	{
  6407 	    if (image->common.extended_format_code == f->format)
  6408 	    {
  6409 		uint8_t *b = (uint8_t *)image->bits.bits;
  6410 		int s = image->bits.rowstride * 4;
  6412 		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
  6413 		iter->stride = s;
  6415 		iter->get_scanline = f->get_scanline;
  6416 		return TRUE;
  6417 	    }
  6418 	}
  6419     }
  6421     return FALSE;
  6422 }
  6424 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
  6425 __attribute__((__force_align_arg_pointer__))
  6426 #endif
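/* On 32-bit x86 the ABI only guarantees 4-byte stack alignment, so GCC is
 * asked to realign the stack on entry; spilled __m128i locals need 16-byte
 * alignment. */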
  6427 pixman_implementation_t *
  6428 _pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
  6429 {
  6430     pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
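    /* The new implementation delegates anything it cannot handle to fallback;
     * sse2_fast_paths registers the composite fast paths defined above. */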
  6432     /* SSE2 constants */
  6433     mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
  6434     mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
  6435     mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
  6436     mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
  6437     mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
  6438     mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
  6439     mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
  6440     mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
  6441     mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
  6442     mask_0080 = create_mask_16_128 (0x0080);
  6443     mask_00ff = create_mask_16_128 (0x00ff);
  6444     mask_0101 = create_mask_16_128 (0x0101);
  6445     mask_ffff = create_mask_16_128 (0xffff);
  6446     mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
  6447     mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
  6448     mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
  6449     mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
  6451     /* Set up function pointers */
  6452     imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
  6453     imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
  6454     imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
  6455     imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
  6456     imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
  6457     imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
  6458     imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
  6459     imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
  6460     imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
  6461     imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
  6463     imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
  6465     imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
  6466     imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
  6467     imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
  6468     imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
  6469     imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
  6470     imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
  6471     imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
  6472     imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
  6473     imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
  6474     imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
  6475     imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
  6477     imp->blt = sse2_blt;
  6478     imp->fill = sse2_fill;
  6480     imp->src_iter_init = sse2_src_iter_init;
  6482     return imp;
  6483 }
