gfx/cairo/libpixman/src/pixman-sse2.c

changeset 0:6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/cairo/libpixman/src/pixman-sse2.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,6483 @@
     1.4 +/*
     1.5 + * Copyright © 2008 Rodrigo Kumpera
     1.6 + * Copyright © 2008 André Tupinambá
     1.7 + *
     1.8 + * Permission to use, copy, modify, distribute, and sell this software and its
     1.9 + * documentation for any purpose is hereby granted without fee, provided that
    1.10 + * the above copyright notice appear in all copies and that both that
    1.11 + * copyright notice and this permission notice appear in supporting
    1.12 + * documentation, and that the name of Red Hat not be used in advertising or
    1.13 + * publicity pertaining to distribution of the software without specific,
    1.14 + * written prior permission.  Red Hat makes no representations about the
    1.15 + * suitability of this software for any purpose.  It is provided "as is"
    1.16 + * without express or implied warranty.
    1.17 + *
    1.18 + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
    1.19 + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
    1.20 + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
    1.21 + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
    1.22 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
    1.23 + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
    1.24 + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
    1.25 + * SOFTWARE.
    1.26 + *
    1.27 + * Author:  Rodrigo Kumpera (kumpera@gmail.com)
    1.28 + *          André Tupinambá (andrelrt@gmail.com)
    1.29 + *
    1.30 + * Based on work by Owen Taylor and Søren Sandmann
    1.31 + */
    1.32 +#ifdef HAVE_CONFIG_H
    1.33 +#include <config.h>
    1.34 +#endif
    1.35 +
    1.36 +#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
    1.37 +#include <emmintrin.h> /* for SSE2 intrinsics */
    1.38 +#include "pixman-private.h"
    1.39 +#include "pixman-combine32.h"
    1.40 +#include "pixman-inlines.h"
    1.41 +
    1.42 +static __m128i mask_0080;
    1.43 +static __m128i mask_00ff;
    1.44 +static __m128i mask_0101;
    1.45 +static __m128i mask_ffff;
    1.46 +static __m128i mask_ff000000;
    1.47 +static __m128i mask_alpha;
    1.48 +
    1.49 +static __m128i mask_565_r;
    1.50 +static __m128i mask_565_g1, mask_565_g2;
    1.51 +static __m128i mask_565_b;
    1.52 +static __m128i mask_red;
    1.53 +static __m128i mask_green;
    1.54 +static __m128i mask_blue;
    1.55 +
    1.56 +static __m128i mask_565_fix_rb;
    1.57 +static __m128i mask_565_fix_g;
    1.58 +
    1.59 +static __m128i mask_565_rb;
    1.60 +static __m128i mask_565_pack_multiplier;
    1.61 +
    1.62 +static force_inline __m128i
    1.63 +unpack_32_1x128 (uint32_t data)
    1.64 +{
    1.65 +    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
    1.66 +}
    1.67 +
    1.68 +static force_inline void
    1.69 +unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
    1.70 +{
    1.71 +    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    1.72 +    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
    1.73 +}
    1.74 +
    1.75 +static force_inline __m128i
    1.76 +unpack_565_to_8888 (__m128i lo)
    1.77 +{
    1.78 +    __m128i r, g, b, rb, t;
    1.79 +
    1.80 +    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    1.81 +    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    1.82 +    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
    1.83 +
    1.84 +    rb = _mm_or_si128 (r, b);
    1.85 +    t  = _mm_and_si128 (rb, mask_565_fix_rb);
    1.86 +    t  = _mm_srli_epi32 (t, 5);
    1.87 +    rb = _mm_or_si128 (rb, t);
    1.88 +
    1.89 +    t  = _mm_and_si128 (g, mask_565_fix_g);
    1.90 +    t  = _mm_srli_epi32 (t, 6);
    1.91 +    g  = _mm_or_si128 (g, t);
    1.92 +
    1.93 +    return _mm_or_si128 (rb, g);
    1.94 +}
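/* Note on the 565 -> 8888 expansion above: each field is shifted to the
 * top of its 8-bit slot (red << 8, green << 5, blue << 3) and the field's
 * own high bits are then replicated into the low bits it leaves empty
 * (>> 5 for red/blue, >> 6 for green).  This way the maximum 5- and 6-bit
 * values expand to exactly 0xff rather than 0xf8/0xfc.
 */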
    1.95 +
    1.96 +static force_inline void
    1.97 +unpack_565_128_4x128 (__m128i  data,
    1.98 +                      __m128i* data0,
    1.99 +                      __m128i* data1,
   1.100 +                      __m128i* data2,
   1.101 +                      __m128i* data3)
   1.102 +{
   1.103 +    __m128i lo, hi;
   1.104 +
   1.105 +    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
   1.106 +    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
   1.107 +
   1.108 +    lo = unpack_565_to_8888 (lo);
   1.109 +    hi = unpack_565_to_8888 (hi);
   1.110 +
   1.111 +    unpack_128_2x128 (lo, data0, data1);
   1.112 +    unpack_128_2x128 (hi, data2, data3);
   1.113 +}
   1.114 +
   1.115 +static force_inline uint16_t
   1.116 +pack_565_32_16 (uint32_t pixel)
   1.117 +{
   1.118 +    return (uint16_t) (((pixel >> 8) & 0xf800) |
   1.119 +		       ((pixel >> 5) & 0x07e0) |
   1.120 +		       ((pixel >> 3) & 0x001f));
   1.121 +}
   1.122 +
   1.123 +static force_inline __m128i
   1.124 +pack_2x128_128 (__m128i lo, __m128i hi)
   1.125 +{
   1.126 +    return _mm_packus_epi16 (lo, hi);
   1.127 +}
   1.128 +
   1.129 +static force_inline __m128i
   1.130 +pack_565_2packedx128_128 (__m128i lo, __m128i hi)
   1.131 +{
   1.132 +    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
   1.133 +    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
   1.134 +
   1.135 +    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
   1.136 +    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
   1.137 +
   1.138 +    __m128i g0 = _mm_and_si128 (lo, mask_green);
   1.139 +    __m128i g1 = _mm_and_si128 (hi, mask_green);
   1.140 +
   1.141 +    t0 = _mm_or_si128 (t0, g0);
   1.142 +    t1 = _mm_or_si128 (t1, g1);
   1.143 +
   1.144 +    /* Simulates _mm_packus_epi32 */
   1.145 +    t0 = _mm_slli_epi32 (t0, 16 - 5);
   1.146 +    t1 = _mm_slli_epi32 (t1, 16 - 5);
   1.147 +    t0 = _mm_srai_epi32 (t0, 16);
   1.148 +    t1 = _mm_srai_epi32 (t1, 16);
   1.149 +    return _mm_packs_epi32 (t0, t1);
   1.150 +}
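/* The "simulated" pack above exists because _mm_packus_epi32 (unsigned
 * 32 -> 16 saturating pack) is only available from SSE4.1 onwards.  Each
 * 32-bit lane holds the 565 result shifted left by 5 bits; the << (16 - 5)
 * and arithmetic >> 16 pair turns that into a sign-extended 16-bit value,
 * which the signed _mm_packs_epi32 then packs without clamping.
 */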
   1.151 +
   1.152 +static force_inline __m128i
   1.153 +pack_565_2x128_128 (__m128i lo, __m128i hi)
   1.154 +{
   1.155 +    __m128i data;
   1.156 +    __m128i r, g1, g2, b;
   1.157 +
   1.158 +    data = pack_2x128_128 (lo, hi);
   1.159 +
   1.160 +    r  = _mm_and_si128 (data, mask_565_r);
   1.161 +    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
   1.162 +    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
   1.163 +    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
   1.164 +
   1.165 +    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
   1.166 +}
   1.167 +
   1.168 +static force_inline __m128i
   1.169 +pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
   1.170 +{
   1.171 +    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
   1.172 +			     pack_565_2x128_128 (*xmm2, *xmm3));
   1.173 +}
   1.174 +
   1.175 +static force_inline int
   1.176 +is_opaque (__m128i x)
   1.177 +{
   1.178 +    __m128i ffs = _mm_cmpeq_epi8 (x, x);
   1.179 +
   1.180 +    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
   1.181 +}
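/* _mm_movemask_epi8 gathers the top bit of each of the 16 bytes.  For
 * packed a8r8g8b8 pixels the alpha bytes are bytes 3, 7, 11 and 15, so the
 * 0x8888 mask used here and in is_transparent below inspects only the four
 * alpha bytes: is_opaque requires them all to be 0xff, is_transparent
 * requires them all to be 0x00.
 */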
   1.182 +
   1.183 +static force_inline int
   1.184 +is_zero (__m128i x)
   1.185 +{
   1.186 +    return _mm_movemask_epi8 (
   1.187 +	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
   1.188 +}
   1.189 +
   1.190 +static force_inline int
   1.191 +is_transparent (__m128i x)
   1.192 +{
   1.193 +    return (_mm_movemask_epi8 (
   1.194 +		_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
   1.195 +}
   1.196 +
   1.197 +static force_inline __m128i
   1.198 +expand_pixel_32_1x128 (uint32_t data)
   1.199 +{
   1.200 +    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
   1.201 +}
   1.202 +
   1.203 +static force_inline __m128i
   1.204 +expand_alpha_1x128 (__m128i data)
   1.205 +{
   1.206 +    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
   1.207 +						     _MM_SHUFFLE (3, 3, 3, 3)),
   1.208 +				_MM_SHUFFLE (3, 3, 3, 3));
   1.209 +}
   1.210 +
   1.211 +static force_inline void
   1.212 +expand_alpha_2x128 (__m128i  data_lo,
   1.213 +                    __m128i  data_hi,
   1.214 +                    __m128i* alpha_lo,
   1.215 +                    __m128i* alpha_hi)
   1.216 +{
   1.217 +    __m128i lo, hi;
   1.218 +
   1.219 +    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
   1.220 +    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
   1.221 +
   1.222 +    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
   1.223 +    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
   1.224 +}
   1.225 +
   1.226 +static force_inline void
   1.227 +expand_alpha_rev_2x128 (__m128i  data_lo,
   1.228 +                        __m128i  data_hi,
   1.229 +                        __m128i* alpha_lo,
   1.230 +                        __m128i* alpha_hi)
   1.231 +{
   1.232 +    __m128i lo, hi;
   1.233 +
   1.234 +    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
   1.235 +    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
   1.236 +    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
   1.237 +    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
   1.238 +}
   1.239 +
   1.240 +static force_inline void
   1.241 +pix_multiply_2x128 (__m128i* data_lo,
   1.242 +                    __m128i* data_hi,
   1.243 +                    __m128i* alpha_lo,
   1.244 +                    __m128i* alpha_hi,
   1.245 +                    __m128i* ret_lo,
   1.246 +                    __m128i* ret_hi)
   1.247 +{
   1.248 +    __m128i lo, hi;
   1.249 +
   1.250 +    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
   1.251 +    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
   1.252 +    lo = _mm_adds_epu16 (lo, mask_0080);
   1.253 +    hi = _mm_adds_epu16 (hi, mask_0080);
   1.254 +    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
   1.255 +    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
   1.256 +}
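/* pix_multiply_* computes the usual pixman per-channel product
 * x * a / 255, rounded to nearest, without a division: with
 * t = x * a + 0x80, the result is (t * 0x0101) >> 16, which is exact for
 * 8-bit inputs.  mask_0080 and mask_0101 (set up elsewhere in this file)
 * hold 0x0080 and 0x0101 in every 16-bit lane.
 */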
   1.257 +
   1.258 +static force_inline void
   1.259 +pix_add_multiply_2x128 (__m128i* src_lo,
   1.260 +                        __m128i* src_hi,
   1.261 +                        __m128i* alpha_dst_lo,
   1.262 +                        __m128i* alpha_dst_hi,
   1.263 +                        __m128i* dst_lo,
   1.264 +                        __m128i* dst_hi,
   1.265 +                        __m128i* alpha_src_lo,
   1.266 +                        __m128i* alpha_src_hi,
   1.267 +                        __m128i* ret_lo,
   1.268 +                        __m128i* ret_hi)
   1.269 +{
   1.270 +    __m128i t1_lo, t1_hi;
   1.271 +    __m128i t2_lo, t2_hi;
   1.272 +
   1.273 +    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
   1.274 +    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
   1.275 +
   1.276 +    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
   1.277 +    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
   1.278 +}
   1.279 +
   1.280 +static force_inline void
   1.281 +negate_2x128 (__m128i  data_lo,
   1.282 +              __m128i  data_hi,
   1.283 +              __m128i* neg_lo,
   1.284 +              __m128i* neg_hi)
   1.285 +{
   1.286 +    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
   1.287 +    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
   1.288 +}
   1.289 +
   1.290 +static force_inline void
   1.291 +invert_colors_2x128 (__m128i  data_lo,
   1.292 +                     __m128i  data_hi,
   1.293 +                     __m128i* inv_lo,
   1.294 +                     __m128i* inv_hi)
   1.295 +{
   1.296 +    __m128i lo, hi;
   1.297 +
   1.298 +    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
   1.299 +    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
   1.300 +    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
   1.301 +    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
   1.302 +}
   1.303 +
   1.304 +static force_inline void
   1.305 +over_2x128 (__m128i* src_lo,
   1.306 +            __m128i* src_hi,
   1.307 +            __m128i* alpha_lo,
   1.308 +            __m128i* alpha_hi,
   1.309 +            __m128i* dst_lo,
   1.310 +            __m128i* dst_hi)
   1.311 +{
   1.312 +    __m128i t1, t2;
   1.313 +
   1.314 +    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
   1.315 +
   1.316 +    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
   1.317 +
   1.318 +    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
   1.319 +    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
   1.320 +}
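/* over_2x128 is premultiplied OVER: dst = src + (1 - src.alpha) * dst.
 * negate_2x128 supplies the (255 - alpha) factor, pix_multiply_2x128 does
 * the rounded division by 255, and _mm_adds_epu8 provides the final
 * saturating add.
 */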
   1.321 +
   1.322 +static force_inline void
   1.323 +over_rev_non_pre_2x128 (__m128i  src_lo,
   1.324 +                        __m128i  src_hi,
   1.325 +                        __m128i* dst_lo,
   1.326 +                        __m128i* dst_hi)
   1.327 +{
   1.328 +    __m128i lo, hi;
   1.329 +    __m128i alpha_lo, alpha_hi;
   1.330 +
   1.331 +    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
   1.332 +
   1.333 +    lo = _mm_or_si128 (alpha_lo, mask_alpha);
   1.334 +    hi = _mm_or_si128 (alpha_hi, mask_alpha);
   1.335 +
   1.336 +    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
   1.337 +
   1.338 +    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
   1.339 +
   1.340 +    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
   1.341 +}
   1.342 +
   1.343 +static force_inline void
   1.344 +in_over_2x128 (__m128i* src_lo,
   1.345 +               __m128i* src_hi,
   1.346 +               __m128i* alpha_lo,
   1.347 +               __m128i* alpha_hi,
   1.348 +               __m128i* mask_lo,
   1.349 +               __m128i* mask_hi,
   1.350 +               __m128i* dst_lo,
   1.351 +               __m128i* dst_hi)
   1.352 +{
   1.353 +    __m128i s_lo, s_hi;
   1.354 +    __m128i a_lo, a_hi;
   1.355 +
   1.356 +    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
   1.357 +    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
   1.358 +
   1.359 +    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
   1.360 +}
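/* in_over_2x128 computes (src IN mask) OVER dst: both the source and its
 * expanded alpha are first multiplied by the mask, and the result is then
 * OVER-composited onto the destination.
 */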
   1.361 +
   1.362 +/* load 4 pixels from a 16-byte boundary aligned address */
   1.363 +static force_inline __m128i
   1.364 +load_128_aligned (__m128i* src)
   1.365 +{
   1.366 +    return _mm_load_si128 (src);
   1.367 +}
   1.368 +
    1.369 +/* load 4 pixels from an unaligned address */
   1.370 +static force_inline __m128i
   1.371 +load_128_unaligned (const __m128i* src)
   1.372 +{
   1.373 +    return _mm_loadu_si128 (src);
   1.374 +}
   1.375 +
   1.376 +/* save 4 pixels using Write Combining memory on a 16-byte
   1.377 + * boundary aligned address
   1.378 + */
   1.379 +static force_inline void
   1.380 +save_128_write_combining (__m128i* dst,
   1.381 +                          __m128i  data)
   1.382 +{
   1.383 +    _mm_stream_si128 (dst, data);
   1.384 +}
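/* _mm_stream_si128 is a non-temporal store: the data goes out through the
 * write-combining buffers without pulling the destination cache line in,
 * which helps for large writes that will not be read back soon.
 */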
   1.385 +
   1.386 +/* save 4 pixels on a 16-byte boundary aligned address */
   1.387 +static force_inline void
   1.388 +save_128_aligned (__m128i* dst,
   1.389 +                  __m128i  data)
   1.390 +{
   1.391 +    _mm_store_si128 (dst, data);
   1.392 +}
   1.393 +
    1.394 +/* save 4 pixels on an unaligned address */
   1.395 +static force_inline void
   1.396 +save_128_unaligned (__m128i* dst,
   1.397 +                    __m128i  data)
   1.398 +{
   1.399 +    _mm_storeu_si128 (dst, data);
   1.400 +}
   1.401 +
   1.402 +static force_inline __m128i
   1.403 +load_32_1x128 (uint32_t data)
   1.404 +{
   1.405 +    return _mm_cvtsi32_si128 (data);
   1.406 +}
   1.407 +
   1.408 +static force_inline __m128i
   1.409 +expand_alpha_rev_1x128 (__m128i data)
   1.410 +{
   1.411 +    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
   1.412 +}
   1.413 +
   1.414 +static force_inline __m128i
   1.415 +expand_pixel_8_1x128 (uint8_t data)
   1.416 +{
   1.417 +    return _mm_shufflelo_epi16 (
   1.418 +	unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
   1.419 +}
   1.420 +
   1.421 +static force_inline __m128i
   1.422 +pix_multiply_1x128 (__m128i data,
   1.423 +		    __m128i alpha)
   1.424 +{
   1.425 +    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
   1.426 +					    mask_0080),
   1.427 +			    mask_0101);
   1.428 +}
   1.429 +
   1.430 +static force_inline __m128i
   1.431 +pix_add_multiply_1x128 (__m128i* src,
   1.432 +			__m128i* alpha_dst,
   1.433 +			__m128i* dst,
   1.434 +			__m128i* alpha_src)
   1.435 +{
   1.436 +    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
   1.437 +    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
   1.438 +
   1.439 +    return _mm_adds_epu8 (t1, t2);
   1.440 +}
   1.441 +
   1.442 +static force_inline __m128i
   1.443 +negate_1x128 (__m128i data)
   1.444 +{
   1.445 +    return _mm_xor_si128 (data, mask_00ff);
   1.446 +}
   1.447 +
   1.448 +static force_inline __m128i
   1.449 +invert_colors_1x128 (__m128i data)
   1.450 +{
   1.451 +    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
   1.452 +}
   1.453 +
   1.454 +static force_inline __m128i
   1.455 +over_1x128 (__m128i src, __m128i alpha, __m128i dst)
   1.456 +{
   1.457 +    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
   1.458 +}
   1.459 +
   1.460 +static force_inline __m128i
   1.461 +in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
   1.462 +{
   1.463 +    return over_1x128 (pix_multiply_1x128 (*src, *mask),
   1.464 +		       pix_multiply_1x128 (*alpha, *mask),
   1.465 +		       *dst);
   1.466 +}
   1.467 +
   1.468 +static force_inline __m128i
   1.469 +over_rev_non_pre_1x128 (__m128i src, __m128i dst)
   1.470 +{
   1.471 +    __m128i alpha = expand_alpha_1x128 (src);
   1.472 +
   1.473 +    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
   1.474 +					   _mm_or_si128 (alpha, mask_alpha)),
   1.475 +		       alpha,
   1.476 +		       dst);
   1.477 +}
   1.478 +
   1.479 +static force_inline uint32_t
   1.480 +pack_1x128_32 (__m128i data)
   1.481 +{
   1.482 +    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
   1.483 +}
   1.484 +
   1.485 +static force_inline __m128i
   1.486 +expand565_16_1x128 (uint16_t pixel)
   1.487 +{
   1.488 +    __m128i m = _mm_cvtsi32_si128 (pixel);
   1.489 +
   1.490 +    m = unpack_565_to_8888 (m);
   1.491 +
   1.492 +    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
   1.493 +}
   1.494 +
   1.495 +static force_inline uint32_t
   1.496 +core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
   1.497 +{
   1.498 +    uint8_t a;
   1.499 +    __m128i xmms;
   1.500 +
   1.501 +    a = src >> 24;
   1.502 +
   1.503 +    if (a == 0xff)
   1.504 +    {
   1.505 +	return src;
   1.506 +    }
   1.507 +    else if (src)
   1.508 +    {
   1.509 +	xmms = unpack_32_1x128 (src);
   1.510 +	return pack_1x128_32 (
   1.511 +	    over_1x128 (xmms, expand_alpha_1x128 (xmms),
   1.512 +			unpack_32_1x128 (dst)));
   1.513 +    }
   1.514 +
   1.515 +    return dst;
   1.516 +}
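/* Scalar OVER for one pixel: an opaque source replaces the destination, a
 * zero source leaves it alone, and anything else goes through the full
 * unpack / over / pack sequence.  The combiners below use this for their
 * unaligned head and leftover tail pixels.
 */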
   1.517 +
   1.518 +static force_inline uint32_t
   1.519 +combine1 (const uint32_t *ps, const uint32_t *pm)
   1.520 +{
   1.521 +    uint32_t s = *ps;
   1.522 +
   1.523 +    if (pm)
   1.524 +    {
   1.525 +	__m128i ms, mm;
   1.526 +
   1.527 +	mm = unpack_32_1x128 (*pm);
   1.528 +	mm = expand_alpha_1x128 (mm);
   1.529 +
   1.530 +	ms = unpack_32_1x128 (s);
   1.531 +	ms = pix_multiply_1x128 (ms, mm);
   1.532 +
   1.533 +	s = pack_1x128_32 (ms);
   1.534 +    }
   1.535 +
   1.536 +    return s;
   1.537 +}
   1.538 +
   1.539 +static force_inline __m128i
   1.540 +combine4 (const __m128i *ps, const __m128i *pm)
   1.541 +{
   1.542 +    __m128i xmm_src_lo, xmm_src_hi;
   1.543 +    __m128i xmm_msk_lo, xmm_msk_hi;
   1.544 +    __m128i s;
   1.545 +
   1.546 +    if (pm)
   1.547 +    {
   1.548 +	xmm_msk_lo = load_128_unaligned (pm);
   1.549 +
   1.550 +	if (is_transparent (xmm_msk_lo))
   1.551 +	    return _mm_setzero_si128 ();
   1.552 +    }
   1.553 +
   1.554 +    s = load_128_unaligned (ps);
   1.555 +
   1.556 +    if (pm)
   1.557 +    {
   1.558 +	unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
   1.559 +	unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
   1.560 +
   1.561 +	expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
   1.562 +
   1.563 +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
   1.564 +			    &xmm_msk_lo, &xmm_msk_hi,
   1.565 +			    &xmm_src_lo, &xmm_src_hi);
   1.566 +
   1.567 +	s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
   1.568 +    }
   1.569 +
   1.570 +    return s;
   1.571 +}
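/* combine1 and combine4 implement the unified-alpha mask handling: when a
 * mask is present only its alpha channel matters, so the mask alpha is
 * expanded to all channels and multiplied into the source before the
 * source reaches the operator.  combine1 handles one pixel, combine4
 * handles four at a time and short-circuits a fully transparent mask.
 */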
   1.572 +
   1.573 +static force_inline void
   1.574 +core_combine_over_u_sse2_mask (uint32_t *	  pd,
   1.575 +			       const uint32_t*    ps,
   1.576 +			       const uint32_t*    pm,
   1.577 +			       int                w)
   1.578 +{
   1.579 +    uint32_t s, d;
   1.580 +
   1.581 +    /* Align dst on a 16-byte boundary */
   1.582 +    while (w && ((uintptr_t)pd & 15))
   1.583 +    {
   1.584 +	d = *pd;
   1.585 +	s = combine1 (ps, pm);
   1.586 +
   1.587 +	if (s)
   1.588 +	    *pd = core_combine_over_u_pixel_sse2 (s, d);
   1.589 +	pd++;
   1.590 +	ps++;
   1.591 +	pm++;
   1.592 +	w--;
   1.593 +    }
   1.594 +
   1.595 +    while (w >= 4)
   1.596 +    {
   1.597 +	__m128i mask = load_128_unaligned ((__m128i *)pm);
   1.598 +
   1.599 +	if (!is_zero (mask))
   1.600 +	{
   1.601 +	    __m128i src;
   1.602 +	    __m128i src_hi, src_lo;
   1.603 +	    __m128i mask_hi, mask_lo;
   1.604 +	    __m128i alpha_hi, alpha_lo;
   1.605 +
   1.606 +	    src = load_128_unaligned ((__m128i *)ps);
   1.607 +
   1.608 +	    if (is_opaque (_mm_and_si128 (src, mask)))
   1.609 +	    {
   1.610 +		save_128_aligned ((__m128i *)pd, src);
   1.611 +	    }
   1.612 +	    else
   1.613 +	    {
   1.614 +		__m128i dst = load_128_aligned ((__m128i *)pd);
   1.615 +		__m128i dst_hi, dst_lo;
   1.616 +
   1.617 +		unpack_128_2x128 (mask, &mask_lo, &mask_hi);
   1.618 +		unpack_128_2x128 (src, &src_lo, &src_hi);
   1.619 +
   1.620 +		expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
   1.621 +		pix_multiply_2x128 (&src_lo, &src_hi,
   1.622 +				    &mask_lo, &mask_hi,
   1.623 +				    &src_lo, &src_hi);
   1.624 +
   1.625 +		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
   1.626 +
   1.627 +		expand_alpha_2x128 (src_lo, src_hi,
   1.628 +				    &alpha_lo, &alpha_hi);
   1.629 +
   1.630 +		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
   1.631 +			    &dst_lo, &dst_hi);
   1.632 +
   1.633 +		save_128_aligned (
   1.634 +		    (__m128i *)pd,
   1.635 +		    pack_2x128_128 (dst_lo, dst_hi));
   1.636 +	    }
   1.637 +	}
   1.638 +
   1.639 +	pm += 4;
   1.640 +	ps += 4;
   1.641 +	pd += 4;
   1.642 +	w -= 4;
   1.643 +    }
   1.644 +    while (w)
   1.645 +    {
   1.646 +	d = *pd;
   1.647 +	s = combine1 (ps, pm);
   1.648 +
   1.649 +	if (s)
   1.650 +	    *pd = core_combine_over_u_pixel_sse2 (s, d);
   1.651 +	pd++;
   1.652 +	ps++;
   1.653 +	pm++;
   1.654 +
   1.655 +	w--;
   1.656 +    }
   1.657 +}
   1.658 +
   1.659 +static force_inline void
   1.660 +core_combine_over_u_sse2_no_mask (uint32_t *	  pd,
   1.661 +				  const uint32_t*    ps,
   1.662 +				  int                w)
   1.663 +{
   1.664 +    uint32_t s, d;
   1.665 +
   1.666 +    /* Align dst on a 16-byte boundary */
   1.667 +    while (w && ((uintptr_t)pd & 15))
   1.668 +    {
   1.669 +	d = *pd;
   1.670 +	s = *ps;
   1.671 +
   1.672 +	if (s)
   1.673 +	    *pd = core_combine_over_u_pixel_sse2 (s, d);
   1.674 +	pd++;
   1.675 +	ps++;
   1.676 +	w--;
   1.677 +    }
   1.678 +
   1.679 +    while (w >= 4)
   1.680 +    {
   1.681 +	__m128i src;
   1.682 +	__m128i src_hi, src_lo, dst_hi, dst_lo;
   1.683 +	__m128i alpha_hi, alpha_lo;
   1.684 +
   1.685 +	src = load_128_unaligned ((__m128i *)ps);
   1.686 +
   1.687 +	if (!is_zero (src))
   1.688 +	{
   1.689 +	    if (is_opaque (src))
   1.690 +	    {
   1.691 +		save_128_aligned ((__m128i *)pd, src);
   1.692 +	    }
   1.693 +	    else
   1.694 +	    {
   1.695 +		__m128i dst = load_128_aligned ((__m128i *)pd);
   1.696 +
   1.697 +		unpack_128_2x128 (src, &src_lo, &src_hi);
   1.698 +		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
   1.699 +
   1.700 +		expand_alpha_2x128 (src_lo, src_hi,
   1.701 +				    &alpha_lo, &alpha_hi);
   1.702 +		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
   1.703 +			    &dst_lo, &dst_hi);
   1.704 +
   1.705 +		save_128_aligned (
   1.706 +		    (__m128i *)pd,
   1.707 +		    pack_2x128_128 (dst_lo, dst_hi));
   1.708 +	    }
   1.709 +	}
   1.710 +
   1.711 +	ps += 4;
   1.712 +	pd += 4;
   1.713 +	w -= 4;
   1.714 +    }
   1.715 +    while (w)
   1.716 +    {
   1.717 +	d = *pd;
   1.718 +	s = *ps;
   1.719 +
   1.720 +	if (s)
   1.721 +	    *pd = core_combine_over_u_pixel_sse2 (s, d);
   1.722 +	pd++;
   1.723 +	ps++;
   1.724 +
   1.725 +	w--;
   1.726 +    }
   1.727 +}
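/* The combiners in this file follow the pattern visible above: a scalar
 * loop until the destination is 16-byte aligned, a four-pixel SIMD loop
 * with aligned destination accesses (source and mask may stay unaligned),
 * and a scalar tail for the last 0-3 pixels.  The OVER loops also skip a
 * group of four entirely when the source (or its mask) is all zero, and
 * store the source directly when it is fully opaque under the mask.
 */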
   1.728 +
   1.729 +static force_inline void
   1.730 +sse2_combine_over_u (pixman_implementation_t *imp,
   1.731 +                     pixman_op_t              op,
   1.732 +                     uint32_t *               pd,
   1.733 +                     const uint32_t *         ps,
   1.734 +                     const uint32_t *         pm,
   1.735 +                     int                      w)
   1.736 +{
   1.737 +    if (pm)
   1.738 +	core_combine_over_u_sse2_mask (pd, ps, pm, w);
   1.739 +    else
   1.740 +	core_combine_over_u_sse2_no_mask (pd, ps, w);
   1.741 +}
   1.742 +
   1.743 +static void
   1.744 +sse2_combine_over_reverse_u (pixman_implementation_t *imp,
   1.745 +                             pixman_op_t              op,
   1.746 +                             uint32_t *               pd,
   1.747 +                             const uint32_t *         ps,
   1.748 +                             const uint32_t *         pm,
   1.749 +                             int                      w)
   1.750 +{
   1.751 +    uint32_t s, d;
   1.752 +
   1.753 +    __m128i xmm_dst_lo, xmm_dst_hi;
   1.754 +    __m128i xmm_src_lo, xmm_src_hi;
   1.755 +    __m128i xmm_alpha_lo, xmm_alpha_hi;
   1.756 +
   1.757 +    /* Align dst on a 16-byte boundary */
   1.758 +    while (w &&
   1.759 +           ((uintptr_t)pd & 15))
   1.760 +    {
   1.761 +	d = *pd;
   1.762 +	s = combine1 (ps, pm);
   1.763 +
   1.764 +	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
   1.765 +	w--;
   1.766 +	ps++;
   1.767 +	if (pm)
   1.768 +	    pm++;
   1.769 +    }
   1.770 +
   1.771 +    while (w >= 4)
   1.772 +    {
   1.773 +	/* I'm loading unaligned because I'm not sure
   1.774 +	 * about the address alignment.
   1.775 +	 */
   1.776 +	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
   1.777 +	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
   1.778 +
   1.779 +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   1.780 +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   1.781 +
   1.782 +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
   1.783 +			    &xmm_alpha_lo, &xmm_alpha_hi);
   1.784 +
   1.785 +	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
   1.786 +		    &xmm_alpha_lo, &xmm_alpha_hi,
   1.787 +		    &xmm_src_lo, &xmm_src_hi);
   1.788 +
    1.789 +	/* rebuild the 4 pixel data and save */
   1.790 +	save_128_aligned ((__m128i*)pd,
   1.791 +			  pack_2x128_128 (xmm_src_lo, xmm_src_hi));
   1.792 +
   1.793 +	w -= 4;
   1.794 +	ps += 4;
   1.795 +	pd += 4;
   1.796 +
   1.797 +	if (pm)
   1.798 +	    pm += 4;
   1.799 +    }
   1.800 +
   1.801 +    while (w)
   1.802 +    {
   1.803 +	d = *pd;
   1.804 +	s = combine1 (ps, pm);
   1.805 +
   1.806 +	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
   1.807 +	ps++;
   1.808 +	w--;
   1.809 +	if (pm)
   1.810 +	    pm++;
   1.811 +    }
   1.812 +}
   1.813 +
   1.814 +static force_inline uint32_t
   1.815 +core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
   1.816 +{
   1.817 +    uint32_t maska = src >> 24;
   1.818 +
   1.819 +    if (maska == 0)
   1.820 +    {
   1.821 +	return 0;
   1.822 +    }
   1.823 +    else if (maska != 0xff)
   1.824 +    {
   1.825 +	return pack_1x128_32 (
   1.826 +	    pix_multiply_1x128 (unpack_32_1x128 (dst),
   1.827 +				expand_alpha_1x128 (unpack_32_1x128 (src))));
   1.828 +    }
   1.829 +
   1.830 +    return dst;
   1.831 +}
   1.832 +
   1.833 +static void
   1.834 +sse2_combine_in_u (pixman_implementation_t *imp,
   1.835 +                   pixman_op_t              op,
   1.836 +                   uint32_t *               pd,
   1.837 +                   const uint32_t *         ps,
   1.838 +                   const uint32_t *         pm,
   1.839 +                   int                      w)
   1.840 +{
   1.841 +    uint32_t s, d;
   1.842 +
   1.843 +    __m128i xmm_src_lo, xmm_src_hi;
   1.844 +    __m128i xmm_dst_lo, xmm_dst_hi;
   1.845 +
   1.846 +    while (w && ((uintptr_t)pd & 15))
   1.847 +    {
   1.848 +	s = combine1 (ps, pm);
   1.849 +	d = *pd;
   1.850 +
   1.851 +	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
   1.852 +	w--;
   1.853 +	ps++;
   1.854 +	if (pm)
   1.855 +	    pm++;
   1.856 +    }
   1.857 +
   1.858 +    while (w >= 4)
   1.859 +    {
   1.860 +	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
   1.861 +	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
   1.862 +
   1.863 +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   1.864 +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   1.865 +
   1.866 +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   1.867 +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
   1.868 +			    &xmm_dst_lo, &xmm_dst_hi,
   1.869 +			    &xmm_dst_lo, &xmm_dst_hi);
   1.870 +
   1.871 +	save_128_aligned ((__m128i*)pd,
   1.872 +			  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   1.873 +
   1.874 +	ps += 4;
   1.875 +	pd += 4;
   1.876 +	w -= 4;
   1.877 +	if (pm)
   1.878 +	    pm += 4;
   1.879 +    }
   1.880 +
   1.881 +    while (w)
   1.882 +    {
   1.883 +	s = combine1 (ps, pm);
   1.884 +	d = *pd;
   1.885 +
   1.886 +	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
   1.887 +	w--;
   1.888 +	ps++;
   1.889 +	if (pm)
   1.890 +	    pm++;
   1.891 +    }
   1.892 +}
   1.893 +
   1.894 +static void
   1.895 +sse2_combine_in_reverse_u (pixman_implementation_t *imp,
   1.896 +                           pixman_op_t              op,
   1.897 +                           uint32_t *               pd,
   1.898 +                           const uint32_t *         ps,
   1.899 +                           const uint32_t *         pm,
   1.900 +                           int                      w)
   1.901 +{
   1.902 +    uint32_t s, d;
   1.903 +
   1.904 +    __m128i xmm_src_lo, xmm_src_hi;
   1.905 +    __m128i xmm_dst_lo, xmm_dst_hi;
   1.906 +
   1.907 +    while (w && ((uintptr_t)pd & 15))
   1.908 +    {
   1.909 +	s = combine1 (ps, pm);
   1.910 +	d = *pd;
   1.911 +
   1.912 +	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
   1.913 +	ps++;
   1.914 +	w--;
   1.915 +	if (pm)
   1.916 +	    pm++;
   1.917 +    }
   1.918 +
   1.919 +    while (w >= 4)
   1.920 +    {
   1.921 +	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
   1.922 +	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
   1.923 +
   1.924 +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   1.925 +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   1.926 +
   1.927 +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   1.928 +	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
   1.929 +			    &xmm_src_lo, &xmm_src_hi,
   1.930 +			    &xmm_dst_lo, &xmm_dst_hi);
   1.931 +
   1.932 +	save_128_aligned (
   1.933 +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   1.934 +
   1.935 +	ps += 4;
   1.936 +	pd += 4;
   1.937 +	w -= 4;
   1.938 +	if (pm)
   1.939 +	    pm += 4;
   1.940 +    }
   1.941 +
   1.942 +    while (w)
   1.943 +    {
   1.944 +	s = combine1 (ps, pm);
   1.945 +	d = *pd;
   1.946 +
   1.947 +	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
   1.948 +	w--;
   1.949 +	ps++;
   1.950 +	if (pm)
   1.951 +	    pm++;
   1.952 +    }
   1.953 +}
   1.954 +
   1.955 +static void
   1.956 +sse2_combine_out_reverse_u (pixman_implementation_t *imp,
   1.957 +                            pixman_op_t              op,
   1.958 +                            uint32_t *               pd,
   1.959 +                            const uint32_t *         ps,
   1.960 +                            const uint32_t *         pm,
   1.961 +                            int                      w)
   1.962 +{
   1.963 +    while (w && ((uintptr_t)pd & 15))
   1.964 +    {
   1.965 +	uint32_t s = combine1 (ps, pm);
   1.966 +	uint32_t d = *pd;
   1.967 +
   1.968 +	*pd++ = pack_1x128_32 (
   1.969 +	    pix_multiply_1x128 (
   1.970 +		unpack_32_1x128 (d), negate_1x128 (
   1.971 +		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
   1.972 +
   1.973 +	if (pm)
   1.974 +	    pm++;
   1.975 +	ps++;
   1.976 +	w--;
   1.977 +    }
   1.978 +
   1.979 +    while (w >= 4)
   1.980 +    {
   1.981 +	__m128i xmm_src_lo, xmm_src_hi;
   1.982 +	__m128i xmm_dst_lo, xmm_dst_hi;
   1.983 +
   1.984 +	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
   1.985 +	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
   1.986 +
   1.987 +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   1.988 +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
   1.989 +
   1.990 +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   1.991 +	negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
   1.992 +
   1.993 +	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
   1.994 +			    &xmm_src_lo, &xmm_src_hi,
   1.995 +			    &xmm_dst_lo, &xmm_dst_hi);
   1.996 +
   1.997 +	save_128_aligned (
   1.998 +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
   1.999 +
  1.1000 +	ps += 4;
  1.1001 +	pd += 4;
  1.1002 +	if (pm)
  1.1003 +	    pm += 4;
  1.1004 +
  1.1005 +	w -= 4;
  1.1006 +    }
  1.1007 +
  1.1008 +    while (w)
  1.1009 +    {
  1.1010 +	uint32_t s = combine1 (ps, pm);
  1.1011 +	uint32_t d = *pd;
  1.1012 +
  1.1013 +	*pd++ = pack_1x128_32 (
  1.1014 +	    pix_multiply_1x128 (
  1.1015 +		unpack_32_1x128 (d), negate_1x128 (
  1.1016 +		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
  1.1017 +	ps++;
  1.1018 +	if (pm)
  1.1019 +	    pm++;
  1.1020 +	w--;
  1.1021 +    }
  1.1022 +}
  1.1023 +
  1.1024 +static void
  1.1025 +sse2_combine_out_u (pixman_implementation_t *imp,
  1.1026 +                    pixman_op_t              op,
  1.1027 +                    uint32_t *               pd,
  1.1028 +                    const uint32_t *         ps,
  1.1029 +                    const uint32_t *         pm,
  1.1030 +                    int                      w)
  1.1031 +{
  1.1032 +    while (w && ((uintptr_t)pd & 15))
  1.1033 +    {
  1.1034 +	uint32_t s = combine1 (ps, pm);
  1.1035 +	uint32_t d = *pd;
  1.1036 +
  1.1037 +	*pd++ = pack_1x128_32 (
  1.1038 +	    pix_multiply_1x128 (
  1.1039 +		unpack_32_1x128 (s), negate_1x128 (
  1.1040 +		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
  1.1041 +	w--;
  1.1042 +	ps++;
  1.1043 +	if (pm)
  1.1044 +	    pm++;
  1.1045 +    }
  1.1046 +
  1.1047 +    while (w >= 4)
  1.1048 +    {
  1.1049 +	__m128i xmm_src_lo, xmm_src_hi;
  1.1050 +	__m128i xmm_dst_lo, xmm_dst_hi;
  1.1051 +
  1.1052 +	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
  1.1053 +	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
  1.1054 +
  1.1055 +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1.1056 +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.1057 +
  1.1058 +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.1059 +	negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.1060 +
  1.1061 +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.1062 +			    &xmm_dst_lo, &xmm_dst_hi,
  1.1063 +			    &xmm_dst_lo, &xmm_dst_hi);
  1.1064 +
  1.1065 +	save_128_aligned (
  1.1066 +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.1067 +
  1.1068 +	ps += 4;
  1.1069 +	pd += 4;
  1.1070 +	w -= 4;
  1.1071 +	if (pm)
  1.1072 +	    pm += 4;
  1.1073 +    }
  1.1074 +
  1.1075 +    while (w)
  1.1076 +    {
  1.1077 +	uint32_t s = combine1 (ps, pm);
  1.1078 +	uint32_t d = *pd;
  1.1079 +
  1.1080 +	*pd++ = pack_1x128_32 (
  1.1081 +	    pix_multiply_1x128 (
  1.1082 +		unpack_32_1x128 (s), negate_1x128 (
  1.1083 +		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
  1.1084 +	w--;
  1.1085 +	ps++;
  1.1086 +	if (pm)
  1.1087 +	    pm++;
  1.1088 +    }
  1.1089 +}
  1.1090 +
  1.1091 +static force_inline uint32_t
  1.1092 +core_combine_atop_u_pixel_sse2 (uint32_t src,
  1.1093 +                                uint32_t dst)
  1.1094 +{
  1.1095 +    __m128i s = unpack_32_1x128 (src);
  1.1096 +    __m128i d = unpack_32_1x128 (dst);
  1.1097 +
  1.1098 +    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
  1.1099 +    __m128i da = expand_alpha_1x128 (d);
  1.1100 +
  1.1101 +    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
  1.1102 +}
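/* ATOP per pixel: result = src * dst.alpha + dst * (1 - src.alpha), with
 * both products and the saturating add handled by pix_add_multiply.
 */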
  1.1103 +
  1.1104 +static void
  1.1105 +sse2_combine_atop_u (pixman_implementation_t *imp,
  1.1106 +                     pixman_op_t              op,
  1.1107 +                     uint32_t *               pd,
  1.1108 +                     const uint32_t *         ps,
  1.1109 +                     const uint32_t *         pm,
  1.1110 +                     int                      w)
  1.1111 +{
  1.1112 +    uint32_t s, d;
  1.1113 +
  1.1114 +    __m128i xmm_src_lo, xmm_src_hi;
  1.1115 +    __m128i xmm_dst_lo, xmm_dst_hi;
  1.1116 +    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
  1.1117 +    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
  1.1118 +
  1.1119 +    while (w && ((uintptr_t)pd & 15))
  1.1120 +    {
  1.1121 +	s = combine1 (ps, pm);
  1.1122 +	d = *pd;
  1.1123 +
  1.1124 +	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
  1.1125 +	w--;
  1.1126 +	ps++;
  1.1127 +	if (pm)
  1.1128 +	    pm++;
  1.1129 +    }
  1.1130 +
  1.1131 +    while (w >= 4)
  1.1132 +    {
  1.1133 +	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
  1.1134 +	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
  1.1135 +
  1.1136 +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1.1137 +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.1138 +
  1.1139 +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1.1140 +			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  1.1141 +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1.1142 +			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1.1143 +
  1.1144 +	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
  1.1145 +		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  1.1146 +
  1.1147 +	pix_add_multiply_2x128 (
  1.1148 +	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
  1.1149 +	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
  1.1150 +	    &xmm_dst_lo, &xmm_dst_hi);
  1.1151 +
  1.1152 +	save_128_aligned (
  1.1153 +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.1154 +
  1.1155 +	ps += 4;
  1.1156 +	pd += 4;
  1.1157 +	w -= 4;
  1.1158 +	if (pm)
  1.1159 +	    pm += 4;
  1.1160 +    }
  1.1161 +
  1.1162 +    while (w)
  1.1163 +    {
  1.1164 +	s = combine1 (ps, pm);
  1.1165 +	d = *pd;
  1.1166 +
  1.1167 +	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
  1.1168 +	w--;
  1.1169 +	ps++;
  1.1170 +	if (pm)
  1.1171 +	    pm++;
  1.1172 +    }
  1.1173 +}
  1.1174 +
  1.1175 +static force_inline uint32_t
  1.1176 +core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
  1.1177 +                                        uint32_t dst)
  1.1178 +{
  1.1179 +    __m128i s = unpack_32_1x128 (src);
  1.1180 +    __m128i d = unpack_32_1x128 (dst);
  1.1181 +
  1.1182 +    __m128i sa = expand_alpha_1x128 (s);
  1.1183 +    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
  1.1184 +
  1.1185 +    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
  1.1186 +}
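/* ATOP_REVERSE per pixel: result = src * (1 - dst.alpha) + dst * src.alpha. */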
  1.1187 +
  1.1188 +static void
  1.1189 +sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
  1.1190 +                             pixman_op_t              op,
  1.1191 +                             uint32_t *               pd,
  1.1192 +                             const uint32_t *         ps,
  1.1193 +                             const uint32_t *         pm,
  1.1194 +                             int                      w)
  1.1195 +{
  1.1196 +    uint32_t s, d;
  1.1197 +
  1.1198 +    __m128i xmm_src_lo, xmm_src_hi;
  1.1199 +    __m128i xmm_dst_lo, xmm_dst_hi;
  1.1200 +    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
  1.1201 +    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
  1.1202 +
  1.1203 +    while (w && ((uintptr_t)pd & 15))
  1.1204 +    {
  1.1205 +	s = combine1 (ps, pm);
  1.1206 +	d = *pd;
  1.1207 +
  1.1208 +	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
  1.1209 +	ps++;
  1.1210 +	w--;
  1.1211 +	if (pm)
  1.1212 +	    pm++;
  1.1213 +    }
  1.1214 +
  1.1215 +    while (w >= 4)
  1.1216 +    {
  1.1217 +	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
  1.1218 +	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
  1.1219 +
  1.1220 +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1.1221 +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.1222 +
  1.1223 +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1.1224 +			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  1.1225 +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1.1226 +			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1.1227 +
  1.1228 +	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
  1.1229 +		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1.1230 +
  1.1231 +	pix_add_multiply_2x128 (
  1.1232 +	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
  1.1233 +	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
  1.1234 +	    &xmm_dst_lo, &xmm_dst_hi);
  1.1235 +
  1.1236 +	save_128_aligned (
  1.1237 +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.1238 +
  1.1239 +	ps += 4;
  1.1240 +	pd += 4;
  1.1241 +	w -= 4;
  1.1242 +	if (pm)
  1.1243 +	    pm += 4;
  1.1244 +    }
  1.1245 +
  1.1246 +    while (w)
  1.1247 +    {
  1.1248 +	s = combine1 (ps, pm);
  1.1249 +	d = *pd;
  1.1250 +
  1.1251 +	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
  1.1252 +	ps++;
  1.1253 +	w--;
  1.1254 +	if (pm)
  1.1255 +	    pm++;
  1.1256 +    }
  1.1257 +}
  1.1258 +
  1.1259 +static force_inline uint32_t
  1.1260 +core_combine_xor_u_pixel_sse2 (uint32_t src,
  1.1261 +                               uint32_t dst)
  1.1262 +{
  1.1263 +    __m128i s = unpack_32_1x128 (src);
  1.1264 +    __m128i d = unpack_32_1x128 (dst);
  1.1265 +
  1.1266 +    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
  1.1267 +    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
  1.1268 +
  1.1269 +    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
  1.1270 +}
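/* XOR per pixel: result = src * (1 - dst.alpha) + dst * (1 - src.alpha). */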
  1.1271 +
  1.1272 +static void
  1.1273 +sse2_combine_xor_u (pixman_implementation_t *imp,
  1.1274 +                    pixman_op_t              op,
  1.1275 +                    uint32_t *               dst,
  1.1276 +                    const uint32_t *         src,
  1.1277 +                    const uint32_t *         mask,
  1.1278 +                    int                      width)
  1.1279 +{
  1.1280 +    int w = width;
  1.1281 +    uint32_t s, d;
  1.1282 +    uint32_t* pd = dst;
  1.1283 +    const uint32_t* ps = src;
  1.1284 +    const uint32_t* pm = mask;
  1.1285 +
  1.1286 +    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  1.1287 +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1.1288 +    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
  1.1289 +    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
  1.1290 +
  1.1291 +    while (w && ((uintptr_t)pd & 15))
  1.1292 +    {
  1.1293 +	s = combine1 (ps, pm);
  1.1294 +	d = *pd;
  1.1295 +
  1.1296 +	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
  1.1297 +	w--;
  1.1298 +	ps++;
  1.1299 +	if (pm)
  1.1300 +	    pm++;
  1.1301 +    }
  1.1302 +
  1.1303 +    while (w >= 4)
  1.1304 +    {
  1.1305 +	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
  1.1306 +	xmm_dst = load_128_aligned ((__m128i*) pd);
  1.1307 +
  1.1308 +	unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  1.1309 +	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1.1310 +
  1.1311 +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1.1312 +			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  1.1313 +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1.1314 +			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1.1315 +
  1.1316 +	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
  1.1317 +		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  1.1318 +	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
  1.1319 +		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1.1320 +
  1.1321 +	pix_add_multiply_2x128 (
  1.1322 +	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
  1.1323 +	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
  1.1324 +	    &xmm_dst_lo, &xmm_dst_hi);
  1.1325 +
  1.1326 +	save_128_aligned (
  1.1327 +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.1328 +
  1.1329 +	ps += 4;
  1.1330 +	pd += 4;
  1.1331 +	w -= 4;
  1.1332 +	if (pm)
  1.1333 +	    pm += 4;
  1.1334 +    }
  1.1335 +
  1.1336 +    while (w)
  1.1337 +    {
  1.1338 +	s = combine1 (ps, pm);
  1.1339 +	d = *pd;
  1.1340 +
  1.1341 +	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
  1.1342 +	w--;
  1.1343 +	ps++;
  1.1344 +	if (pm)
  1.1345 +	    pm++;
  1.1346 +    }
  1.1347 +}
  1.1348 +
  1.1349 +static force_inline void
  1.1350 +sse2_combine_add_u (pixman_implementation_t *imp,
  1.1351 +                    pixman_op_t              op,
  1.1352 +                    uint32_t *               dst,
  1.1353 +                    const uint32_t *         src,
  1.1354 +                    const uint32_t *         mask,
  1.1355 +                    int                      width)
  1.1356 +{
  1.1357 +    int w = width;
  1.1358 +    uint32_t s, d;
  1.1359 +    uint32_t* pd = dst;
  1.1360 +    const uint32_t* ps = src;
  1.1361 +    const uint32_t* pm = mask;
  1.1362 +
  1.1363 +    while (w && (uintptr_t)pd & 15)
  1.1364 +    {
  1.1365 +	s = combine1 (ps, pm);
  1.1366 +	d = *pd;
  1.1367 +
  1.1368 +	ps++;
  1.1369 +	if (pm)
  1.1370 +	    pm++;
  1.1371 +	*pd++ = _mm_cvtsi128_si32 (
  1.1372 +	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
  1.1373 +	w--;
  1.1374 +    }
  1.1375 +
  1.1376 +    while (w >= 4)
  1.1377 +    {
  1.1378 +	__m128i s;
  1.1379 +
  1.1380 +	s = combine4 ((__m128i*)ps, (__m128i*)pm);
  1.1381 +
  1.1382 +	save_128_aligned (
  1.1383 +	    (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
  1.1384 +
  1.1385 +	pd += 4;
  1.1386 +	ps += 4;
  1.1387 +	if (pm)
  1.1388 +	    pm += 4;
  1.1389 +	w -= 4;
  1.1390 +    }
  1.1391 +
  1.1392 +    while (w--)
  1.1393 +    {
  1.1394 +	s = combine1 (ps, pm);
  1.1395 +	d = *pd;
  1.1396 +
  1.1397 +	ps++;
  1.1398 +	*pd++ = _mm_cvtsi128_si32 (
  1.1399 +	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
  1.1400 +	if (pm)
  1.1401 +	    pm++;
  1.1402 +    }
  1.1403 +}
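/* ADD needs no unpacking: the operator is a plain per-channel saturating
 * add, so the four-pixel loop is just a load, _mm_adds_epu8 and an aligned
 * store.
 */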
  1.1404 +
  1.1405 +static force_inline uint32_t
  1.1406 +core_combine_saturate_u_pixel_sse2 (uint32_t src,
  1.1407 +                                    uint32_t dst)
  1.1408 +{
  1.1409 +    __m128i ms = unpack_32_1x128 (src);
  1.1410 +    __m128i md = unpack_32_1x128 (dst);
  1.1411 +    uint32_t sa = src >> 24;
  1.1412 +    uint32_t da = ~dst >> 24;
  1.1413 +
  1.1414 +    if (sa > da)
  1.1415 +    {
  1.1416 +	ms = pix_multiply_1x128 (
  1.1417 +	    ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
  1.1418 +    }
  1.1419 +
  1.1420 +    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
  1.1421 +}
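/* SATURATE: when the source alpha fits into the free space left in the
 * destination (~dst.alpha), the pixels are simply added; otherwise the
 * source is first scaled by DIV_UN8 (da, sa), the ratio of free space to
 * source alpha, so the per-channel sum stays in range.
 */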
  1.1422 +
  1.1423 +static void
  1.1424 +sse2_combine_saturate_u (pixman_implementation_t *imp,
  1.1425 +                         pixman_op_t              op,
  1.1426 +                         uint32_t *               pd,
  1.1427 +                         const uint32_t *         ps,
  1.1428 +                         const uint32_t *         pm,
  1.1429 +                         int                      w)
  1.1430 +{
  1.1431 +    uint32_t s, d;
  1.1432 +
  1.1433 +    uint32_t pack_cmp;
  1.1434 +    __m128i xmm_src, xmm_dst;
  1.1435 +
  1.1436 +    while (w && (uintptr_t)pd & 15)
  1.1437 +    {
  1.1438 +	s = combine1 (ps, pm);
  1.1439 +	d = *pd;
  1.1440 +
  1.1441 +	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
  1.1442 +	w--;
  1.1443 +	ps++;
  1.1444 +	if (pm)
  1.1445 +	    pm++;
  1.1446 +    }
  1.1447 +
  1.1448 +    while (w >= 4)
  1.1449 +    {
  1.1450 +	xmm_dst = load_128_aligned  ((__m128i*)pd);
  1.1451 +	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
  1.1452 +
  1.1453 +	pack_cmp = _mm_movemask_epi8 (
  1.1454 +	    _mm_cmpgt_epi32 (
  1.1455 +		_mm_srli_epi32 (xmm_src, 24),
  1.1456 +		_mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
  1.1457 +
   1.1458 +	/* if some src alpha is greater than the respective ~dst alpha */
  1.1459 +	if (pack_cmp)
  1.1460 +	{
  1.1461 +	    s = combine1 (ps++, pm);
  1.1462 +	    d = *pd;
  1.1463 +	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
  1.1464 +	    if (pm)
  1.1465 +		pm++;
  1.1466 +
  1.1467 +	    s = combine1 (ps++, pm);
  1.1468 +	    d = *pd;
  1.1469 +	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
  1.1470 +	    if (pm)
  1.1471 +		pm++;
  1.1472 +
  1.1473 +	    s = combine1 (ps++, pm);
  1.1474 +	    d = *pd;
  1.1475 +	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
  1.1476 +	    if (pm)
  1.1477 +		pm++;
  1.1478 +
  1.1479 +	    s = combine1 (ps++, pm);
  1.1480 +	    d = *pd;
  1.1481 +	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
  1.1482 +	    if (pm)
  1.1483 +		pm++;
  1.1484 +	}
  1.1485 +	else
  1.1486 +	{
  1.1487 +	    save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
  1.1488 +
  1.1489 +	    pd += 4;
  1.1490 +	    ps += 4;
  1.1491 +	    if (pm)
  1.1492 +		pm += 4;
  1.1493 +	}
  1.1494 +
  1.1495 +	w -= 4;
  1.1496 +    }
  1.1497 +
  1.1498 +    while (w--)
  1.1499 +    {
  1.1500 +	s = combine1 (ps, pm);
  1.1501 +	d = *pd;
  1.1502 +
  1.1503 +	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
  1.1504 +	ps++;
  1.1505 +	if (pm)
  1.1506 +	    pm++;
  1.1507 +    }
  1.1508 +}
  1.1509 +
  1.1510 +static void
  1.1511 +sse2_combine_src_ca (pixman_implementation_t *imp,
  1.1512 +                     pixman_op_t              op,
  1.1513 +                     uint32_t *               pd,
  1.1514 +                     const uint32_t *         ps,
  1.1515 +                     const uint32_t *         pm,
  1.1516 +                     int                      w)
  1.1517 +{
  1.1518 +    uint32_t s, m;
  1.1519 +
  1.1520 +    __m128i xmm_src_lo, xmm_src_hi;
  1.1521 +    __m128i xmm_mask_lo, xmm_mask_hi;
  1.1522 +    __m128i xmm_dst_lo, xmm_dst_hi;
  1.1523 +
  1.1524 +    while (w && (uintptr_t)pd & 15)
  1.1525 +    {
  1.1526 +	s = *ps++;
  1.1527 +	m = *pm++;
  1.1528 +	*pd++ = pack_1x128_32 (
  1.1529 +	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
  1.1530 +	w--;
  1.1531 +    }
  1.1532 +
  1.1533 +    while (w >= 4)
  1.1534 +    {
  1.1535 +	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1.1536 +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1.1537 +
  1.1538 +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1.1539 +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1.1540 +
  1.1541 +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.1542 +			    &xmm_mask_lo, &xmm_mask_hi,
  1.1543 +			    &xmm_dst_lo, &xmm_dst_hi);
  1.1544 +
  1.1545 +	save_128_aligned (
  1.1546 +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.1547 +
  1.1548 +	ps += 4;
  1.1549 +	pd += 4;
  1.1550 +	pm += 4;
  1.1551 +	w -= 4;
  1.1552 +    }
  1.1553 +
  1.1554 +    while (w)
  1.1555 +    {
  1.1556 +	s = *ps++;
  1.1557 +	m = *pm++;
  1.1558 +	*pd++ = pack_1x128_32 (
  1.1559 +	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
  1.1560 +	w--;
  1.1561 +    }
  1.1562 +}
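/* The *_ca combiners are the component-alpha variants: the mask is a full
 * ARGB value applied per channel instead of being reduced to its alpha, so
 * SRC becomes src * mask channel by channel, and the operators below feed
 * the per-channel mask into in_over / over accordingly.
 */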
  1.1563 +
  1.1564 +static force_inline uint32_t
  1.1565 +core_combine_over_ca_pixel_sse2 (uint32_t src,
  1.1566 +                                 uint32_t mask,
  1.1567 +                                 uint32_t dst)
  1.1568 +{
  1.1569 +    __m128i s = unpack_32_1x128 (src);
  1.1570 +    __m128i expAlpha = expand_alpha_1x128 (s);
  1.1571 +    __m128i unpk_mask = unpack_32_1x128 (mask);
  1.1572 +    __m128i unpk_dst  = unpack_32_1x128 (dst);
  1.1573 +
  1.1574 +    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
  1.1575 +}
  1.1576 +
  1.1577 +static void
  1.1578 +sse2_combine_over_ca (pixman_implementation_t *imp,
  1.1579 +                      pixman_op_t              op,
  1.1580 +                      uint32_t *               pd,
  1.1581 +                      const uint32_t *         ps,
  1.1582 +                      const uint32_t *         pm,
  1.1583 +                      int                      w)
  1.1584 +{
  1.1585 +    uint32_t s, m, d;
  1.1586 +
  1.1587 +    __m128i xmm_alpha_lo, xmm_alpha_hi;
  1.1588 +    __m128i xmm_src_lo, xmm_src_hi;
  1.1589 +    __m128i xmm_dst_lo, xmm_dst_hi;
  1.1590 +    __m128i xmm_mask_lo, xmm_mask_hi;
  1.1591 +
  1.1592 +    while (w && (uintptr_t)pd & 15)
  1.1593 +    {
  1.1594 +	s = *ps++;
  1.1595 +	m = *pm++;
  1.1596 +	d = *pd;
  1.1597 +
  1.1598 +	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
  1.1599 +	w--;
  1.1600 +    }
  1.1601 +
  1.1602 +    while (w >= 4)
  1.1603 +    {
  1.1604 +	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1.1605 +	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1.1606 +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1.1607 +
  1.1608 +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.1609 +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1.1610 +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1.1611 +
  1.1612 +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1.1613 +			    &xmm_alpha_lo, &xmm_alpha_hi);
  1.1614 +
  1.1615 +	in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.1616 +		       &xmm_alpha_lo, &xmm_alpha_hi,
  1.1617 +		       &xmm_mask_lo, &xmm_mask_hi,
  1.1618 +		       &xmm_dst_lo, &xmm_dst_hi);
  1.1619 +
  1.1620 +	save_128_aligned (
  1.1621 +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.1622 +
  1.1623 +	ps += 4;
  1.1624 +	pd += 4;
  1.1625 +	pm += 4;
  1.1626 +	w -= 4;
  1.1627 +    }
  1.1628 +
  1.1629 +    while (w)
  1.1630 +    {
  1.1631 +	s = *ps++;
  1.1632 +	m = *pm++;
  1.1633 +	d = *pd;
  1.1634 +
  1.1635 +	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
  1.1636 +	w--;
  1.1637 +    }
  1.1638 +}
  1.1639 +
  1.1640 +static force_inline uint32_t
  1.1641 +core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
  1.1642 +                                         uint32_t mask,
  1.1643 +                                         uint32_t dst)
  1.1644 +{
  1.1645 +    __m128i d = unpack_32_1x128 (dst);
  1.1646 +
  1.1647 +    return pack_1x128_32 (
  1.1648 +	over_1x128 (d, expand_alpha_1x128 (d),
  1.1649 +		    pix_multiply_1x128 (unpack_32_1x128 (src),
  1.1650 +					unpack_32_1x128 (mask))));
  1.1651 +}
  1.1652 +
  1.1653 +static void
  1.1654 +sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
  1.1655 +                              pixman_op_t              op,
  1.1656 +                              uint32_t *               pd,
  1.1657 +                              const uint32_t *         ps,
  1.1658 +                              const uint32_t *         pm,
  1.1659 +                              int                      w)
  1.1660 +{
  1.1661 +    uint32_t s, m, d;
  1.1662 +
  1.1663 +    __m128i xmm_alpha_lo, xmm_alpha_hi;
  1.1664 +    __m128i xmm_src_lo, xmm_src_hi;
  1.1665 +    __m128i xmm_dst_lo, xmm_dst_hi;
  1.1666 +    __m128i xmm_mask_lo, xmm_mask_hi;
  1.1667 +
  1.1668 +    while (w && (uintptr_t)pd & 15)
  1.1669 +    {
  1.1670 +	s = *ps++;
  1.1671 +	m = *pm++;
  1.1672 +	d = *pd;
  1.1673 +
  1.1674 +	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
  1.1675 +	w--;
  1.1676 +    }
  1.1677 +
  1.1678 +    while (w >= 4)
  1.1679 +    {
  1.1680 +	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1.1681 +	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1.1682 +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1.1683 +
  1.1684 +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.1685 +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1.1686 +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1.1687 +
  1.1688 +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1.1689 +			    &xmm_alpha_lo, &xmm_alpha_hi);
  1.1690 +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.1691 +			    &xmm_mask_lo, &xmm_mask_hi,
  1.1692 +			    &xmm_mask_lo, &xmm_mask_hi);
  1.1693 +
  1.1694 +	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
  1.1695 +		    &xmm_alpha_lo, &xmm_alpha_hi,
  1.1696 +		    &xmm_mask_lo, &xmm_mask_hi);
  1.1697 +
  1.1698 +	save_128_aligned (
  1.1699 +	    (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
  1.1700 +
  1.1701 +	ps += 4;
  1.1702 +	pd += 4;
  1.1703 +	pm += 4;
  1.1704 +	w -= 4;
  1.1705 +    }
  1.1706 +
  1.1707 +    while (w)
  1.1708 +    {
  1.1709 +	s = *ps++;
  1.1710 +	m = *pm++;
  1.1711 +	d = *pd;
  1.1712 +
  1.1713 +	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
  1.1714 +	w--;
  1.1715 +    }
  1.1716 +}
  1.1717 +
  1.1718 +static void
  1.1719 +sse2_combine_in_ca (pixman_implementation_t *imp,
  1.1720 +                    pixman_op_t              op,
  1.1721 +                    uint32_t *               pd,
  1.1722 +                    const uint32_t *         ps,
  1.1723 +                    const uint32_t *         pm,
  1.1724 +                    int                      w)
  1.1725 +{
  1.1726 +    uint32_t s, m, d;
  1.1727 +
  1.1728 +    __m128i xmm_alpha_lo, xmm_alpha_hi;
  1.1729 +    __m128i xmm_src_lo, xmm_src_hi;
  1.1730 +    __m128i xmm_dst_lo, xmm_dst_hi;
  1.1731 +    __m128i xmm_mask_lo, xmm_mask_hi;
  1.1732 +
  1.1733 +    while (w && (uintptr_t)pd & 15)
  1.1734 +    {
  1.1735 +	s = *ps++;
  1.1736 +	m = *pm++;
  1.1737 +	d = *pd;
  1.1738 +
  1.1739 +	*pd++ = pack_1x128_32 (
  1.1740 +	    pix_multiply_1x128 (
  1.1741 +		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
  1.1742 +		expand_alpha_1x128 (unpack_32_1x128 (d))));
  1.1743 +
  1.1744 +	w--;
  1.1745 +    }
  1.1746 +
  1.1747 +    while (w >= 4)
  1.1748 +    {
  1.1749 +	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1.1750 +	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1.1751 +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1.1752 +
  1.1753 +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.1754 +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1.1755 +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1.1756 +
  1.1757 +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1.1758 +			    &xmm_alpha_lo, &xmm_alpha_hi);
  1.1759 +
  1.1760 +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.1761 +			    &xmm_mask_lo, &xmm_mask_hi,
  1.1762 +			    &xmm_dst_lo, &xmm_dst_hi);
  1.1763 +
  1.1764 +	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
  1.1765 +			    &xmm_alpha_lo, &xmm_alpha_hi,
  1.1766 +			    &xmm_dst_lo, &xmm_dst_hi);
  1.1767 +
  1.1768 +	save_128_aligned (
  1.1769 +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.1770 +
  1.1771 +	ps += 4;
  1.1772 +	pd += 4;
  1.1773 +	pm += 4;
  1.1774 +	w -= 4;
  1.1775 +    }
  1.1776 +
  1.1777 +    while (w)
  1.1778 +    {
  1.1779 +	s = *ps++;
  1.1780 +	m = *pm++;
  1.1781 +	d = *pd;
  1.1782 +
  1.1783 +	*pd++ = pack_1x128_32 (
  1.1784 +	    pix_multiply_1x128 (
  1.1785 +		pix_multiply_1x128 (
  1.1786 +		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
  1.1787 +		expand_alpha_1x128 (unpack_32_1x128 (d))));
  1.1788 +
  1.1789 +	w--;
  1.1790 +    }
  1.1791 +}
  1.1792 +
  1.1793 +static void
  1.1794 +sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
  1.1795 +                            pixman_op_t              op,
  1.1796 +                            uint32_t *               pd,
  1.1797 +                            const uint32_t *         ps,
  1.1798 +                            const uint32_t *         pm,
  1.1799 +                            int                      w)
  1.1800 +{
  1.1801 +    uint32_t s, m, d;
  1.1802 +
  1.1803 +    __m128i xmm_alpha_lo, xmm_alpha_hi;
  1.1804 +    __m128i xmm_src_lo, xmm_src_hi;
  1.1805 +    __m128i xmm_dst_lo, xmm_dst_hi;
  1.1806 +    __m128i xmm_mask_lo, xmm_mask_hi;
  1.1807 +
  1.1808 +    while (w && (uintptr_t)pd & 15)
  1.1809 +    {
  1.1810 +	s = *ps++;
  1.1811 +	m = *pm++;
  1.1812 +	d = *pd;
  1.1813 +
  1.1814 +	*pd++ = pack_1x128_32 (
  1.1815 +	    pix_multiply_1x128 (
  1.1816 +		unpack_32_1x128 (d),
  1.1817 +		pix_multiply_1x128 (unpack_32_1x128 (m),
  1.1818 +				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
  1.1819 +	w--;
  1.1820 +    }
  1.1821 +
  1.1822 +    while (w >= 4)
  1.1823 +    {
  1.1824 +	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1.1825 +	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1.1826 +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1.1827 +
  1.1828 +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.1829 +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1.1830 +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1.1831 +
  1.1832 +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1.1833 +			    &xmm_alpha_lo, &xmm_alpha_hi);
  1.1834 +	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
  1.1835 +			    &xmm_alpha_lo, &xmm_alpha_hi,
  1.1836 +			    &xmm_alpha_lo, &xmm_alpha_hi);
  1.1837 +
  1.1838 +	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
  1.1839 +			    &xmm_alpha_lo, &xmm_alpha_hi,
  1.1840 +			    &xmm_dst_lo, &xmm_dst_hi);
  1.1841 +
  1.1842 +	save_128_aligned (
  1.1843 +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.1844 +
  1.1845 +	ps += 4;
  1.1846 +	pd += 4;
  1.1847 +	pm += 4;
  1.1848 +	w -= 4;
  1.1849 +    }
  1.1850 +
  1.1851 +    while (w)
  1.1852 +    {
  1.1853 +	s = *ps++;
  1.1854 +	m = *pm++;
  1.1855 +	d = *pd;
  1.1856 +
  1.1857 +	*pd++ = pack_1x128_32 (
  1.1858 +	    pix_multiply_1x128 (
  1.1859 +		unpack_32_1x128 (d),
  1.1860 +		pix_multiply_1x128 (unpack_32_1x128 (m),
  1.1861 +				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
  1.1862 +	w--;
  1.1863 +    }
  1.1864 +}
  1.1865 +
  1.1866 +static void
  1.1867 +sse2_combine_out_ca (pixman_implementation_t *imp,
  1.1868 +                     pixman_op_t              op,
  1.1869 +                     uint32_t *               pd,
  1.1870 +                     const uint32_t *         ps,
  1.1871 +                     const uint32_t *         pm,
  1.1872 +                     int                      w)
  1.1873 +{
  1.1874 +    uint32_t s, m, d;
  1.1875 +
  1.1876 +    __m128i xmm_alpha_lo, xmm_alpha_hi;
  1.1877 +    __m128i xmm_src_lo, xmm_src_hi;
  1.1878 +    __m128i xmm_dst_lo, xmm_dst_hi;
  1.1879 +    __m128i xmm_mask_lo, xmm_mask_hi;
  1.1880 +
  1.1881 +    while (w && (uintptr_t)pd & 15)
  1.1882 +    {
  1.1883 +	s = *ps++;
  1.1884 +	m = *pm++;
  1.1885 +	d = *pd;
  1.1886 +
  1.1887 +	*pd++ = pack_1x128_32 (
  1.1888 +	    pix_multiply_1x128 (
  1.1889 +		pix_multiply_1x128 (
  1.1890 +		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
  1.1891 +		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
  1.1892 +	w--;
  1.1893 +    }
  1.1894 +
  1.1895 +    while (w >= 4)
  1.1896 +    {
  1.1897 +	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1.1898 +	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1.1899 +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1.1900 +
  1.1901 +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.1902 +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1.1903 +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1.1904 +
  1.1905 +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1.1906 +			    &xmm_alpha_lo, &xmm_alpha_hi);
  1.1907 +	negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
  1.1908 +		      &xmm_alpha_lo, &xmm_alpha_hi);
  1.1909 +
  1.1910 +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.1911 +			    &xmm_mask_lo, &xmm_mask_hi,
  1.1912 +			    &xmm_dst_lo, &xmm_dst_hi);
  1.1913 +	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
  1.1914 +			    &xmm_alpha_lo, &xmm_alpha_hi,
  1.1915 +			    &xmm_dst_lo, &xmm_dst_hi);
  1.1916 +
  1.1917 +	save_128_aligned (
  1.1918 +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.1919 +
  1.1920 +	ps += 4;
  1.1921 +	pd += 4;
  1.1922 +	pm += 4;
  1.1923 +	w -= 4;
  1.1924 +    }
  1.1925 +
  1.1926 +    while (w)
  1.1927 +    {
  1.1928 +	s = *ps++;
  1.1929 +	m = *pm++;
  1.1930 +	d = *pd;
  1.1931 +
  1.1932 +	*pd++ = pack_1x128_32 (
  1.1933 +	    pix_multiply_1x128 (
  1.1934 +		pix_multiply_1x128 (
  1.1935 +		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
  1.1936 +		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
  1.1937 +
  1.1938 +	w--;
  1.1939 +    }
  1.1940 +}
  1.1941 +
  1.1942 +static void
  1.1943 +sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
  1.1944 +                             pixman_op_t              op,
  1.1945 +                             uint32_t *               pd,
  1.1946 +                             const uint32_t *         ps,
  1.1947 +                             const uint32_t *         pm,
  1.1948 +                             int                      w)
  1.1949 +{
  1.1950 +    uint32_t s, m, d;
  1.1951 +
  1.1952 +    __m128i xmm_alpha_lo, xmm_alpha_hi;
  1.1953 +    __m128i xmm_src_lo, xmm_src_hi;
  1.1954 +    __m128i xmm_dst_lo, xmm_dst_hi;
  1.1955 +    __m128i xmm_mask_lo, xmm_mask_hi;
  1.1956 +
  1.1957 +    while (w && (uintptr_t)pd & 15)
  1.1958 +    {
  1.1959 +	s = *ps++;
  1.1960 +	m = *pm++;
  1.1961 +	d = *pd;
  1.1962 +
  1.1963 +	*pd++ = pack_1x128_32 (
  1.1964 +	    pix_multiply_1x128 (
  1.1965 +		unpack_32_1x128 (d),
  1.1966 +		negate_1x128 (pix_multiply_1x128 (
  1.1967 +				 unpack_32_1x128 (m),
  1.1968 +				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
  1.1969 +	w--;
  1.1970 +    }
  1.1971 +
  1.1972 +    while (w >= 4)
  1.1973 +    {
  1.1974 +	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1.1975 +	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1.1976 +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1.1977 +
  1.1978 +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.1979 +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1.1980 +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1.1981 +
  1.1982 +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1.1983 +			    &xmm_alpha_lo, &xmm_alpha_hi);
  1.1984 +
  1.1985 +	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
  1.1986 +			    &xmm_alpha_lo, &xmm_alpha_hi,
  1.1987 +			    &xmm_mask_lo, &xmm_mask_hi);
  1.1988 +
  1.1989 +	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
  1.1990 +		      &xmm_mask_lo, &xmm_mask_hi);
  1.1991 +
  1.1992 +	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
  1.1993 +			    &xmm_mask_lo, &xmm_mask_hi,
  1.1994 +			    &xmm_dst_lo, &xmm_dst_hi);
  1.1995 +
  1.1996 +	save_128_aligned (
  1.1997 +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.1998 +
  1.1999 +	ps += 4;
  1.2000 +	pd += 4;
  1.2001 +	pm += 4;
  1.2002 +	w -= 4;
  1.2003 +    }
  1.2004 +
  1.2005 +    while (w)
  1.2006 +    {
  1.2007 +	s = *ps++;
  1.2008 +	m = *pm++;
  1.2009 +	d = *pd;
  1.2010 +
  1.2011 +	*pd++ = pack_1x128_32 (
  1.2012 +	    pix_multiply_1x128 (
  1.2013 +		unpack_32_1x128 (d),
  1.2014 +		negate_1x128 (pix_multiply_1x128 (
  1.2015 +				 unpack_32_1x128 (m),
  1.2016 +				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
  1.2017 +	w--;
  1.2018 +    }
  1.2019 +}
  1.2020 +
  1.2021 +static force_inline uint32_t
  1.2022 +core_combine_atop_ca_pixel_sse2 (uint32_t src,
  1.2023 +                                 uint32_t mask,
  1.2024 +                                 uint32_t dst)
  1.2025 +{
  1.2026 +    __m128i m = unpack_32_1x128 (mask);
  1.2027 +    __m128i s = unpack_32_1x128 (src);
  1.2028 +    __m128i d = unpack_32_1x128 (dst);
  1.2029 +    __m128i sa = expand_alpha_1x128 (s);
  1.2030 +    __m128i da = expand_alpha_1x128 (d);
  1.2031 +
  1.2032 +    s = pix_multiply_1x128 (s, m);
  1.2033 +    m = negate_1x128 (pix_multiply_1x128 (m, sa));
  1.2034 +
  1.2035 +    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
  1.2036 +}
  1.2037 +
  1.2038 +static void
  1.2039 +sse2_combine_atop_ca (pixman_implementation_t *imp,
  1.2040 +                      pixman_op_t              op,
  1.2041 +                      uint32_t *               pd,
  1.2042 +                      const uint32_t *         ps,
  1.2043 +                      const uint32_t *         pm,
  1.2044 +                      int                      w)
  1.2045 +{
  1.2046 +    uint32_t s, m, d;
  1.2047 +
  1.2048 +    __m128i xmm_src_lo, xmm_src_hi;
  1.2049 +    __m128i xmm_dst_lo, xmm_dst_hi;
  1.2050 +    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
  1.2051 +    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
  1.2052 +    __m128i xmm_mask_lo, xmm_mask_hi;
  1.2053 +
  1.2054 +    while (w && (uintptr_t)pd & 15)
  1.2055 +    {
  1.2056 +	s = *ps++;
  1.2057 +	m = *pm++;
  1.2058 +	d = *pd;
  1.2059 +
  1.2060 +	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
  1.2061 +	w--;
  1.2062 +    }
  1.2063 +
  1.2064 +    while (w >= 4)
  1.2065 +    {
  1.2066 +	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1.2067 +	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1.2068 +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1.2069 +
  1.2070 +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.2071 +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1.2072 +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1.2073 +
  1.2074 +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1.2075 +			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  1.2076 +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1.2077 +			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1.2078 +
  1.2079 +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.2080 +			    &xmm_mask_lo, &xmm_mask_hi,
  1.2081 +			    &xmm_src_lo, &xmm_src_hi);
  1.2082 +	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
  1.2083 +			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
  1.2084 +			    &xmm_mask_lo, &xmm_mask_hi);
  1.2085 +
  1.2086 +	negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1.2087 +
  1.2088 +	pix_add_multiply_2x128 (
  1.2089 +	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
  1.2090 +	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
  1.2091 +	    &xmm_dst_lo, &xmm_dst_hi);
  1.2092 +
  1.2093 +	save_128_aligned (
  1.2094 +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.2095 +
  1.2096 +	ps += 4;
  1.2097 +	pd += 4;
  1.2098 +	pm += 4;
  1.2099 +	w -= 4;
  1.2100 +    }
  1.2101 +
  1.2102 +    while (w)
  1.2103 +    {
  1.2104 +	s = *ps++;
  1.2105 +	m = *pm++;
  1.2106 +	d = *pd;
  1.2107 +
  1.2108 +	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
  1.2109 +	w--;
  1.2110 +    }
  1.2111 +}
  1.2112 +
  1.2113 +static force_inline uint32_t
  1.2114 +core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
  1.2115 +                                         uint32_t mask,
  1.2116 +                                         uint32_t dst)
  1.2117 +{
  1.2118 +    __m128i m = unpack_32_1x128 (mask);
  1.2119 +    __m128i s = unpack_32_1x128 (src);
  1.2120 +    __m128i d = unpack_32_1x128 (dst);
  1.2121 +
  1.2122 +    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
  1.2123 +    __m128i sa = expand_alpha_1x128 (s);
  1.2124 +
  1.2125 +    s = pix_multiply_1x128 (s, m);
  1.2126 +    m = pix_multiply_1x128 (m, sa);
  1.2127 +
  1.2128 +    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
  1.2129 +}
  1.2130 +
  1.2131 +static void
  1.2132 +sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
  1.2133 +                              pixman_op_t              op,
  1.2134 +                              uint32_t *               pd,
  1.2135 +                              const uint32_t *         ps,
  1.2136 +                              const uint32_t *         pm,
  1.2137 +                              int                      w)
  1.2138 +{
  1.2139 +    uint32_t s, m, d;
  1.2140 +
  1.2141 +    __m128i xmm_src_lo, xmm_src_hi;
  1.2142 +    __m128i xmm_dst_lo, xmm_dst_hi;
  1.2143 +    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
  1.2144 +    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
  1.2145 +    __m128i xmm_mask_lo, xmm_mask_hi;
  1.2146 +
  1.2147 +    while (w && (uintptr_t)pd & 15)
  1.2148 +    {
  1.2149 +	s = *ps++;
  1.2150 +	m = *pm++;
  1.2151 +	d = *pd;
  1.2152 +
  1.2153 +	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
  1.2154 +	w--;
  1.2155 +    }
  1.2156 +
  1.2157 +    while (w >= 4)
  1.2158 +    {
  1.2159 +	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1.2160 +	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1.2161 +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1.2162 +
  1.2163 +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.2164 +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1.2165 +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1.2166 +
  1.2167 +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1.2168 +			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  1.2169 +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1.2170 +			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1.2171 +
  1.2172 +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.2173 +			    &xmm_mask_lo, &xmm_mask_hi,
  1.2174 +			    &xmm_src_lo, &xmm_src_hi);
  1.2175 +	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
  1.2176 +			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
  1.2177 +			    &xmm_mask_lo, &xmm_mask_hi);
  1.2178 +
  1.2179 +	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
  1.2180 +		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1.2181 +
  1.2182 +	pix_add_multiply_2x128 (
  1.2183 +	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
  1.2184 +	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
  1.2185 +	    &xmm_dst_lo, &xmm_dst_hi);
  1.2186 +
  1.2187 +	save_128_aligned (
  1.2188 +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.2189 +
  1.2190 +	ps += 4;
  1.2191 +	pd += 4;
  1.2192 +	pm += 4;
  1.2193 +	w -= 4;
  1.2194 +    }
  1.2195 +
  1.2196 +    while (w)
  1.2197 +    {
  1.2198 +	s = *ps++;
  1.2199 +	m = *pm++;
  1.2200 +	d = *pd;
  1.2201 +
  1.2202 +	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
  1.2203 +	w--;
  1.2204 +    }
  1.2205 +}
  1.2206 +
  1.2207 +static force_inline uint32_t
  1.2208 +core_combine_xor_ca_pixel_sse2 (uint32_t src,
  1.2209 +                                uint32_t mask,
  1.2210 +                                uint32_t dst)
  1.2211 +{
  1.2212 +    __m128i a = unpack_32_1x128 (mask);
  1.2213 +    __m128i s = unpack_32_1x128 (src);
  1.2214 +    __m128i d = unpack_32_1x128 (dst);
  1.2215 +
  1.2216 +    __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
  1.2217 +				       a, expand_alpha_1x128 (s)));
  1.2218 +    __m128i dest      = pix_multiply_1x128 (s, a);
  1.2219 +    __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
  1.2220 +
  1.2221 +    return pack_1x128_32 (pix_add_multiply_1x128 (&d,
  1.2222 +                                                &alpha_dst,
  1.2223 +                                                &dest,
  1.2224 +                                                &alpha_src));
  1.2225 +}
  1.2226 +
  1.2227 +static void
  1.2228 +sse2_combine_xor_ca (pixman_implementation_t *imp,
  1.2229 +                     pixman_op_t              op,
  1.2230 +                     uint32_t *               pd,
  1.2231 +                     const uint32_t *         ps,
  1.2232 +                     const uint32_t *         pm,
  1.2233 +                     int                      w)
  1.2234 +{
  1.2235 +    uint32_t s, m, d;
  1.2236 +
  1.2237 +    __m128i xmm_src_lo, xmm_src_hi;
  1.2238 +    __m128i xmm_dst_lo, xmm_dst_hi;
  1.2239 +    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
  1.2240 +    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
  1.2241 +    __m128i xmm_mask_lo, xmm_mask_hi;
  1.2242 +
  1.2243 +    while (w && (uintptr_t)pd & 15)
  1.2244 +    {
  1.2245 +	s = *ps++;
  1.2246 +	m = *pm++;
  1.2247 +	d = *pd;
  1.2248 +
  1.2249 +	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
  1.2250 +	w--;
  1.2251 +    }
  1.2252 +
  1.2253 +    while (w >= 4)
  1.2254 +    {
  1.2255 +	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1.2256 +	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1.2257 +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1.2258 +
  1.2259 +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.2260 +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1.2261 +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1.2262 +
  1.2263 +	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1.2264 +			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
  1.2265 +	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
  1.2266 +			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1.2267 +
  1.2268 +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.2269 +			    &xmm_mask_lo, &xmm_mask_hi,
  1.2270 +			    &xmm_src_lo, &xmm_src_hi);
  1.2271 +	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
  1.2272 +			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
  1.2273 +			    &xmm_mask_lo, &xmm_mask_hi);
  1.2274 +
  1.2275 +	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
  1.2276 +		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
  1.2277 +	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
  1.2278 +		      &xmm_mask_lo, &xmm_mask_hi);
  1.2279 +
  1.2280 +	pix_add_multiply_2x128 (
  1.2281 +	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
  1.2282 +	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
  1.2283 +	    &xmm_dst_lo, &xmm_dst_hi);
  1.2284 +
  1.2285 +	save_128_aligned (
  1.2286 +	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.2287 +
  1.2288 +	ps += 4;
  1.2289 +	pd += 4;
  1.2290 +	pm += 4;
  1.2291 +	w -= 4;
  1.2292 +    }
  1.2293 +
  1.2294 +    while (w)
  1.2295 +    {
  1.2296 +	s = *ps++;
  1.2297 +	m = *pm++;
  1.2298 +	d = *pd;
  1.2299 +
  1.2300 +	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
  1.2301 +	w--;
  1.2302 +    }
  1.2303 +}
  1.2304 +
  1.2305 +static void
  1.2306 +sse2_combine_add_ca (pixman_implementation_t *imp,
  1.2307 +                     pixman_op_t              op,
  1.2308 +                     uint32_t *               pd,
  1.2309 +                     const uint32_t *         ps,
  1.2310 +                     const uint32_t *         pm,
  1.2311 +                     int                      w)
  1.2312 +{
  1.2313 +    uint32_t s, m, d;
  1.2314 +
  1.2315 +    __m128i xmm_src_lo, xmm_src_hi;
  1.2316 +    __m128i xmm_dst_lo, xmm_dst_hi;
  1.2317 +    __m128i xmm_mask_lo, xmm_mask_hi;
  1.2318 +
  1.2319 +    while (w && (uintptr_t)pd & 15)
  1.2320 +    {
  1.2321 +	s = *ps++;
  1.2322 +	m = *pm++;
  1.2323 +	d = *pd;
  1.2324 +
  1.2325 +	*pd++ = pack_1x128_32 (
  1.2326 +	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
  1.2327 +					       unpack_32_1x128 (m)),
  1.2328 +			   unpack_32_1x128 (d)));
  1.2329 +	w--;
  1.2330 +    }
  1.2331 +
  1.2332 +    while (w >= 4)
  1.2333 +    {
  1.2334 +	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
  1.2335 +	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
  1.2336 +	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
  1.2337 +
  1.2338 +	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1.2339 +	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1.2340 +	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.2341 +
  1.2342 +	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.2343 +			    &xmm_mask_lo, &xmm_mask_hi,
  1.2344 +			    &xmm_src_lo, &xmm_src_hi);
  1.2345 +
  1.2346 +	save_128_aligned (
  1.2347 +	    (__m128i*)pd, pack_2x128_128 (
  1.2348 +		_mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
  1.2349 +		_mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
  1.2350 +
  1.2351 +	ps += 4;
  1.2352 +	pd += 4;
  1.2353 +	pm += 4;
  1.2354 +	w -= 4;
  1.2355 +    }
  1.2356 +
  1.2357 +    while (w)
  1.2358 +    {
  1.2359 +	s = *ps++;
  1.2360 +	m = *pm++;
  1.2361 +	d = *pd;
  1.2362 +
  1.2363 +	*pd++ = pack_1x128_32 (
  1.2364 +	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
  1.2365 +					       unpack_32_1x128 (m)),
  1.2366 +			   unpack_32_1x128 (d)));
  1.2367 +	w--;
  1.2368 +    }
  1.2369 +}
  1.2370 +
  1.2371 +static force_inline __m128i
  1.2372 +create_mask_16_128 (uint16_t mask)
  1.2373 +{
  1.2374 +    return _mm_set1_epi16 (mask);
  1.2375 +}
  1.2376 +
  1.2377 +/* Work around a code generation bug in Sun Studio 12. */
  1.2378 +#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
  1.2379 +# define create_mask_2x32_128(mask0, mask1)				\
  1.2380 +    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
  1.2381 +#else
  1.2382 +static force_inline __m128i
  1.2383 +create_mask_2x32_128 (uint32_t mask0,
  1.2384 +                      uint32_t mask1)
  1.2385 +{
  1.2386 +    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
  1.2387 +}
  1.2388 +#endif
  1.2389 +
  1.2390 +static void
  1.2391 +sse2_composite_over_n_8888 (pixman_implementation_t *imp,
  1.2392 +                            pixman_composite_info_t *info)
  1.2393 +{
  1.2394 +    PIXMAN_COMPOSITE_ARGS (info);
  1.2395 +    uint32_t src;
  1.2396 +    uint32_t    *dst_line, *dst, d;
  1.2397 +    int32_t w;
  1.2398 +    int dst_stride;
  1.2399 +    __m128i xmm_src, xmm_alpha;
  1.2400 +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1.2401 +
  1.2402 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.2403 +
  1.2404 +    if (src == 0)
  1.2405 +	return;
  1.2406 +
  1.2407 +    PIXMAN_IMAGE_GET_LINE (
  1.2408 +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.2409 +
  1.2410 +    xmm_src = expand_pixel_32_1x128 (src);
  1.2411 +    xmm_alpha = expand_alpha_1x128 (xmm_src);
  1.2412 +
  1.2413 +    while (height--)
  1.2414 +    {
  1.2415 +	dst = dst_line;
  1.2416 +
  1.2417 +	dst_line += dst_stride;
  1.2418 +	w = width;
  1.2419 +
  1.2420 +	while (w && (uintptr_t)dst & 15)
  1.2421 +	{
  1.2422 +	    d = *dst;
  1.2423 +	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
  1.2424 +						xmm_alpha,
  1.2425 +						unpack_32_1x128 (d)));
  1.2426 +	    w--;
  1.2427 +	}
  1.2428 +
  1.2429 +	while (w >= 4)
  1.2430 +	{
  1.2431 +	    xmm_dst = load_128_aligned ((__m128i*)dst);
  1.2432 +
  1.2433 +	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1.2434 +
  1.2435 +	    over_2x128 (&xmm_src, &xmm_src,
  1.2436 +			&xmm_alpha, &xmm_alpha,
  1.2437 +			&xmm_dst_lo, &xmm_dst_hi);
  1.2438 +
  1.2439 +	    /* rebuild the 4 pixel data and save */
  1.2440 +	    save_128_aligned (
  1.2441 +		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.2442 +
  1.2443 +	    w -= 4;
  1.2444 +	    dst += 4;
  1.2445 +	}
  1.2446 +
  1.2447 +	while (w)
  1.2448 +	{
  1.2449 +	    d = *dst;
  1.2450 +	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
  1.2451 +						xmm_alpha,
  1.2452 +						unpack_32_1x128 (d)));
  1.2453 +	    w--;
  1.2454 +	}
  1.2455 +
  1.2456 +    }
  1.2457 +}
  1.2458 +
  1.2459 +static void
  1.2460 +sse2_composite_over_n_0565 (pixman_implementation_t *imp,
  1.2461 +                            pixman_composite_info_t *info)
  1.2462 +{
  1.2463 +    PIXMAN_COMPOSITE_ARGS (info);
  1.2464 +    uint32_t src;
  1.2465 +    uint16_t    *dst_line, *dst, d;
  1.2466 +    int32_t w;
  1.2467 +    int dst_stride;
  1.2468 +    __m128i xmm_src, xmm_alpha;
  1.2469 +    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
  1.2470 +
  1.2471 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.2472 +
  1.2473 +    if (src == 0)
  1.2474 +	return;
  1.2475 +
  1.2476 +    PIXMAN_IMAGE_GET_LINE (
  1.2477 +	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  1.2478 +
  1.2479 +    xmm_src = expand_pixel_32_1x128 (src);
  1.2480 +    xmm_alpha = expand_alpha_1x128 (xmm_src);
  1.2481 +
  1.2482 +    while (height--)
  1.2483 +    {
  1.2484 +	dst = dst_line;
  1.2485 +
  1.2486 +	dst_line += dst_stride;
  1.2487 +	w = width;
  1.2488 +
  1.2489 +	while (w && (uintptr_t)dst & 15)
  1.2490 +	{
  1.2491 +	    d = *dst;
  1.2492 +
  1.2493 +	    *dst++ = pack_565_32_16 (
  1.2494 +		pack_1x128_32 (over_1x128 (xmm_src,
  1.2495 +					   xmm_alpha,
  1.2496 +					   expand565_16_1x128 (d))));
  1.2497 +	    w--;
  1.2498 +	}
  1.2499 +
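         +	/* Blend 8 r5g6b5 pixels per iteration: unpack to 8888, apply OVER, repack to 565. */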
  1.2500 +	while (w >= 8)
  1.2501 +	{
  1.2502 +	    xmm_dst = load_128_aligned ((__m128i*)dst);
  1.2503 +
  1.2504 +	    unpack_565_128_4x128 (xmm_dst,
  1.2505 +				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
  1.2506 +
  1.2507 +	    over_2x128 (&xmm_src, &xmm_src,
  1.2508 +			&xmm_alpha, &xmm_alpha,
  1.2509 +			&xmm_dst0, &xmm_dst1);
  1.2510 +	    over_2x128 (&xmm_src, &xmm_src,
  1.2511 +			&xmm_alpha, &xmm_alpha,
  1.2512 +			&xmm_dst2, &xmm_dst3);
  1.2513 +
  1.2514 +	    xmm_dst = pack_565_4x128_128 (
  1.2515 +		&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
  1.2516 +
  1.2517 +	    save_128_aligned ((__m128i*)dst, xmm_dst);
  1.2518 +
  1.2519 +	    dst += 8;
  1.2520 +	    w -= 8;
  1.2521 +	}
  1.2522 +
  1.2523 +	while (w--)
  1.2524 +	{
  1.2525 +	    d = *dst;
  1.2526 +	    *dst++ = pack_565_32_16 (
  1.2527 +		pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
  1.2528 +					   expand565_16_1x128 (d))));
  1.2529 +	}
  1.2530 +    }
  1.2531 +
  1.2532 +}
  1.2533 +
  1.2534 +static void
  1.2535 +sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
  1.2536 +				   pixman_composite_info_t *info)
  1.2537 +{
  1.2538 +    PIXMAN_COMPOSITE_ARGS (info);
  1.2539 +    uint32_t src;
  1.2540 +    uint32_t    *dst_line, d;
  1.2541 +    uint32_t    *mask_line, m;
  1.2542 +    uint32_t pack_cmp;
  1.2543 +    int dst_stride, mask_stride;
  1.2544 +
  1.2545 +    __m128i xmm_src;
  1.2546 +    __m128i xmm_dst;
  1.2547 +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  1.2548 +
  1.2549 +    __m128i mmx_src, mmx_mask, mmx_dest;
  1.2550 +
  1.2551 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.2552 +
  1.2553 +    if (src == 0)
  1.2554 +	return;
  1.2555 +
  1.2556 +    PIXMAN_IMAGE_GET_LINE (
  1.2557 +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.2558 +    PIXMAN_IMAGE_GET_LINE (
  1.2559 +	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
  1.2560 +
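         +    /* Expand the solid source to 16 bits per channel once, outside the loops. */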
  1.2561 +    xmm_src = _mm_unpacklo_epi8 (
  1.2562 +	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
  1.2563 +    mmx_src   = xmm_src;
  1.2564 +
  1.2565 +    while (height--)
  1.2566 +    {
  1.2567 +	int w = width;
  1.2568 +	const uint32_t *pm = (uint32_t *)mask_line;
  1.2569 +	uint32_t *pd = (uint32_t *)dst_line;
  1.2570 +
  1.2571 +	dst_line += dst_stride;
  1.2572 +	mask_line += mask_stride;
  1.2573 +
  1.2574 +	while (w && (uintptr_t)pd & 15)
  1.2575 +	{
  1.2576 +	    m = *pm++;
  1.2577 +
  1.2578 +	    if (m)
  1.2579 +	    {
  1.2580 +		d = *pd;
  1.2581 +
  1.2582 +		mmx_mask = unpack_32_1x128 (m);
  1.2583 +		mmx_dest = unpack_32_1x128 (d);
  1.2584 +
  1.2585 +		*pd = pack_1x128_32 (
  1.2586 +		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
  1.2587 +				   mmx_dest));
  1.2588 +	    }
  1.2589 +
  1.2590 +	    pd++;
  1.2591 +	    w--;
  1.2592 +	}
  1.2593 +
  1.2594 +	while (w >= 4)
  1.2595 +	{
  1.2596 +	    xmm_mask = load_128_unaligned ((__m128i*)pm);
  1.2597 +
  1.2598 +	    pack_cmp =
  1.2599 +		_mm_movemask_epi8 (
  1.2600 +		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
  1.2601 +
  1.2602 +	    /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
  1.2603 +	    if (pack_cmp != 0xffff)
  1.2604 +	    {
  1.2605 +		xmm_dst = load_128_aligned ((__m128i*)pd);
  1.2606 +
  1.2607 +		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  1.2608 +
  1.2609 +		pix_multiply_2x128 (&xmm_src, &xmm_src,
  1.2610 +				    &xmm_mask_lo, &xmm_mask_hi,
  1.2611 +				    &xmm_mask_lo, &xmm_mask_hi);
  1.2612 +		xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
  1.2613 +
  1.2614 +		save_128_aligned (
  1.2615 +		    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
  1.2616 +	    }
  1.2617 +
  1.2618 +	    pd += 4;
  1.2619 +	    pm += 4;
  1.2620 +	    w -= 4;
  1.2621 +	}
  1.2622 +
  1.2623 +	while (w)
  1.2624 +	{
  1.2625 +	    m = *pm++;
  1.2626 +
  1.2627 +	    if (m)
  1.2628 +	    {
  1.2629 +		d = *pd;
  1.2630 +
  1.2631 +		mmx_mask = unpack_32_1x128 (m);
  1.2632 +		mmx_dest = unpack_32_1x128 (d);
  1.2633 +
  1.2634 +		*pd = pack_1x128_32 (
  1.2635 +		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
  1.2636 +				   mmx_dest));
  1.2637 +	    }
  1.2638 +
  1.2639 +	    pd++;
  1.2640 +	    w--;
  1.2641 +	}
  1.2642 +    }
  1.2643 +
  1.2644 +}
  1.2645 +
  1.2646 +static void
  1.2647 +sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
  1.2648 +                                    pixman_composite_info_t *info)
  1.2649 +{
  1.2650 +    PIXMAN_COMPOSITE_ARGS (info);
  1.2651 +    uint32_t src;
  1.2652 +    uint32_t    *dst_line, d;
  1.2653 +    uint32_t    *mask_line, m;
  1.2654 +    uint32_t pack_cmp;
  1.2655 +    int dst_stride, mask_stride;
  1.2656 +
  1.2657 +    __m128i xmm_src, xmm_alpha;
  1.2658 +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1.2659 +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  1.2660 +
  1.2661 +    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
  1.2662 +
  1.2663 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.2664 +
  1.2665 +    if (src == 0)
  1.2666 +	return;
  1.2667 +
  1.2668 +    PIXMAN_IMAGE_GET_LINE (
  1.2669 +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.2670 +    PIXMAN_IMAGE_GET_LINE (
  1.2671 +	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
  1.2672 +
  1.2673 +    xmm_src = _mm_unpacklo_epi8 (
  1.2674 +	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
  1.2675 +    xmm_alpha = expand_alpha_1x128 (xmm_src);
  1.2676 +    mmx_src   = xmm_src;
  1.2677 +    mmx_alpha = xmm_alpha;
  1.2678 +
  1.2679 +    while (height--)
  1.2680 +    {
  1.2681 +	int w = width;
  1.2682 +	const uint32_t *pm = (uint32_t *)mask_line;
  1.2683 +	uint32_t *pd = (uint32_t *)dst_line;
  1.2684 +
  1.2685 +	dst_line += dst_stride;
  1.2686 +	mask_line += mask_stride;
  1.2687 +
  1.2688 +	while (w && (uintptr_t)pd & 15)
  1.2689 +	{
  1.2690 +	    m = *pm++;
  1.2691 +
  1.2692 +	    if (m)
  1.2693 +	    {
  1.2694 +		d = *pd;
  1.2695 +		mmx_mask = unpack_32_1x128 (m);
  1.2696 +		mmx_dest = unpack_32_1x128 (d);
  1.2697 +
  1.2698 +		*pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
  1.2699 +		                                  &mmx_alpha,
  1.2700 +		                                  &mmx_mask,
  1.2701 +		                                  &mmx_dest));
  1.2702 +	    }
  1.2703 +
  1.2704 +	    pd++;
  1.2705 +	    w--;
  1.2706 +	}
  1.2707 +
  1.2708 +	while (w >= 4)
  1.2709 +	{
  1.2710 +	    xmm_mask = load_128_unaligned ((__m128i*)pm);
  1.2711 +
  1.2712 +	    pack_cmp =
  1.2713 +		_mm_movemask_epi8 (
  1.2714 +		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
  1.2715 +
  1.2716 +	    /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
  1.2717 +	    if (pack_cmp != 0xffff)
  1.2718 +	    {
  1.2719 +		xmm_dst = load_128_aligned ((__m128i*)pd);
  1.2720 +
  1.2721 +		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  1.2722 +		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1.2723 +
  1.2724 +		in_over_2x128 (&xmm_src, &xmm_src,
  1.2725 +			       &xmm_alpha, &xmm_alpha,
  1.2726 +			       &xmm_mask_lo, &xmm_mask_hi,
  1.2727 +			       &xmm_dst_lo, &xmm_dst_hi);
  1.2728 +
  1.2729 +		save_128_aligned (
  1.2730 +		    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.2731 +	    }
  1.2732 +
  1.2733 +	    pd += 4;
  1.2734 +	    pm += 4;
  1.2735 +	    w -= 4;
  1.2736 +	}
  1.2737 +
  1.2738 +	while (w)
  1.2739 +	{
  1.2740 +	    m = *pm++;
  1.2741 +
  1.2742 +	    if (m)
  1.2743 +	    {
  1.2744 +		d = *pd;
  1.2745 +		mmx_mask = unpack_32_1x128 (m);
  1.2746 +		mmx_dest = unpack_32_1x128 (d);
  1.2747 +
  1.2748 +		*pd = pack_1x128_32 (
  1.2749 +		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
  1.2750 +	    }
  1.2751 +
  1.2752 +	    pd++;
  1.2753 +	    w--;
  1.2754 +	}
  1.2755 +    }
  1.2756 +
  1.2757 +}
  1.2758 +
  1.2759 +static void
  1.2760 +sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
  1.2761 +                                 pixman_composite_info_t *info)
  1.2762 +{
  1.2763 +    PIXMAN_COMPOSITE_ARGS (info);
  1.2764 +    uint32_t    *dst_line, *dst;
  1.2765 +    uint32_t    *src_line, *src;
  1.2766 +    uint32_t mask;
  1.2767 +    int32_t w;
  1.2768 +    int dst_stride, src_stride;
  1.2769 +
  1.2770 +    __m128i xmm_mask;
  1.2771 +    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  1.2772 +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1.2773 +    __m128i xmm_alpha_lo, xmm_alpha_hi;
  1.2774 +
  1.2775 +    PIXMAN_IMAGE_GET_LINE (
  1.2776 +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.2777 +    PIXMAN_IMAGE_GET_LINE (
  1.2778 +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.2779 +
  1.2780 +    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
  1.2781 +
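         +    /* Only the alpha channel of the solid mask is used; replicate it into every 16-bit lane. */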
  1.2782 +    xmm_mask = create_mask_16_128 (mask >> 24);
  1.2783 +
  1.2784 +    while (height--)
  1.2785 +    {
  1.2786 +	dst = dst_line;
  1.2787 +	dst_line += dst_stride;
  1.2788 +	src = src_line;
  1.2789 +	src_line += src_stride;
  1.2790 +	w = width;
  1.2791 +
  1.2792 +	while (w && (uintptr_t)dst & 15)
  1.2793 +	{
  1.2794 +	    uint32_t s = *src++;
  1.2795 +
  1.2796 +	    if (s)
  1.2797 +	    {
  1.2798 +		uint32_t d = *dst;
  1.2799 +		
  1.2800 +		__m128i ms = unpack_32_1x128 (s);
  1.2801 +		__m128i alpha    = expand_alpha_1x128 (ms);
  1.2802 +		__m128i dest     = xmm_mask;
  1.2803 +		__m128i alpha_dst = unpack_32_1x128 (d);
  1.2804 +		
  1.2805 +		*dst = pack_1x128_32 (
  1.2806 +		    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
  1.2807 +	    }
  1.2808 +	    dst++;
  1.2809 +	    w--;
  1.2810 +	}
  1.2811 +
  1.2812 +	while (w >= 4)
  1.2813 +	{
  1.2814 +	    xmm_src = load_128_unaligned ((__m128i*)src);
  1.2815 +
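         +	    /* A fully transparent source leaves the destination unchanged, so skip the store. */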
  1.2816 +	    if (!is_zero (xmm_src))
  1.2817 +	    {
  1.2818 +		xmm_dst = load_128_aligned ((__m128i*)dst);
  1.2819 +		
  1.2820 +		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  1.2821 +		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1.2822 +		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1.2823 +				    &xmm_alpha_lo, &xmm_alpha_hi);
  1.2824 +		
  1.2825 +		in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.2826 +			       &xmm_alpha_lo, &xmm_alpha_hi,
  1.2827 +			       &xmm_mask, &xmm_mask,
  1.2828 +			       &xmm_dst_lo, &xmm_dst_hi);
  1.2829 +		
  1.2830 +		save_128_aligned (
  1.2831 +		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.2832 +	    }
  1.2833 +		
  1.2834 +	    dst += 4;
  1.2835 +	    src += 4;
  1.2836 +	    w -= 4;
  1.2837 +	}
  1.2838 +
  1.2839 +	while (w)
  1.2840 +	{
  1.2841 +	    uint32_t s = *src++;
  1.2842 +
  1.2843 +	    if (s)
  1.2844 +	    {
  1.2845 +		uint32_t d = *dst;
  1.2846 +		
  1.2847 +		__m128i ms = unpack_32_1x128 (s);
  1.2848 +		__m128i alpha = expand_alpha_1x128 (ms);
  1.2849 +		__m128i mask  = xmm_mask;
  1.2850 +		__m128i dest  = unpack_32_1x128 (d);
  1.2851 +		
  1.2852 +		*dst = pack_1x128_32 (
  1.2853 +		    in_over_1x128 (&ms, &alpha, &mask, &dest));
  1.2854 +	    }
  1.2855 +
  1.2856 +	    dst++;
  1.2857 +	    w--;
  1.2858 +	}
  1.2859 +    }
  1.2860 +
  1.2861 +}
  1.2862 +
  1.2863 +static void
  1.2864 +sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
  1.2865 +                              pixman_composite_info_t *info)
  1.2866 +{
  1.2867 +    PIXMAN_COMPOSITE_ARGS (info);
  1.2868 +    uint16_t    *dst_line, *dst;
  1.2869 +    uint32_t    *src_line, *src, s;
  1.2870 +    int dst_stride, src_stride;
  1.2871 +    int32_t w;
  1.2872 +
  1.2873 +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.2874 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  1.2875 +
  1.2876 +    while (height--)
  1.2877 +    {
  1.2878 +	dst = dst_line;
  1.2879 +	dst_line += dst_stride;
  1.2880 +	src = src_line;
  1.2881 +	src_line += src_stride;
  1.2882 +	w = width;
  1.2883 +
  1.2884 +	while (w && (uintptr_t)dst & 15)
  1.2885 +	{
  1.2886 +	    s = *src++;
  1.2887 +	    *dst = convert_8888_to_0565 (s);
  1.2888 +	    dst++;
  1.2889 +	    w--;
  1.2890 +	}
  1.2891 +
  1.2892 +	while (w >= 8)
  1.2893 +	{
  1.2894 +	    __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
  1.2895 +	    __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
  1.2896 +
  1.2897 +	    save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
  1.2898 +
  1.2899 +	    w -= 8;
  1.2900 +	    src += 8;
  1.2901 +	    dst += 8;
  1.2902 +	}
  1.2903 +
  1.2904 +	while (w)
  1.2905 +	{
  1.2906 +	    s = *src++;
  1.2907 +	    *dst = convert_8888_to_0565 (s);
  1.2908 +	    dst++;
  1.2909 +	    w--;
  1.2910 +	}
  1.2911 +    }
  1.2912 +}
  1.2913 +
  1.2914 +static void
  1.2915 +sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
  1.2916 +			      pixman_composite_info_t *info)
  1.2917 +{
  1.2918 +    PIXMAN_COMPOSITE_ARGS (info);
  1.2919 +    uint32_t    *dst_line, *dst;
  1.2920 +    uint32_t    *src_line, *src;
  1.2921 +    int32_t w;
  1.2922 +    int dst_stride, src_stride;
  1.2923 +
  1.2924 +
  1.2925 +    PIXMAN_IMAGE_GET_LINE (
  1.2926 +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.2927 +    PIXMAN_IMAGE_GET_LINE (
  1.2928 +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.2929 +
  1.2930 +    while (height--)
  1.2931 +    {
  1.2932 +	dst = dst_line;
  1.2933 +	dst_line += dst_stride;
  1.2934 +	src = src_line;
  1.2935 +	src_line += src_stride;
  1.2936 +	w = width;
  1.2937 +
  1.2938 +	while (w && (uintptr_t)dst & 15)
  1.2939 +	{
  1.2940 +	    *dst++ = *src++ | 0xff000000;
  1.2941 +	    w--;
  1.2942 +	}
  1.2943 +
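         +	/* Copy 16 pixels per iteration, forcing the alpha byte of each pixel to 0xff. */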
  1.2944 +	while (w >= 16)
  1.2945 +	{
  1.2946 +	    __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
  1.2947 +	    
  1.2948 +	    xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
  1.2949 +	    xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
  1.2950 +	    xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
  1.2951 +	    xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
  1.2952 +	    
  1.2953 +	    save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
  1.2954 +	    save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
  1.2955 +	    save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
  1.2956 +	    save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
  1.2957 +	    
  1.2958 +	    dst += 16;
  1.2959 +	    src += 16;
  1.2960 +	    w -= 16;
  1.2961 +	}
  1.2962 +
  1.2963 +	while (w)
  1.2964 +	{
  1.2965 +	    *dst++ = *src++ | 0xff000000;
  1.2966 +	    w--;
  1.2967 +	}
  1.2968 +    }
  1.2969 +
  1.2970 +}
  1.2971 +
  1.2972 +static void
  1.2973 +sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
  1.2974 +                                 pixman_composite_info_t *info)
  1.2975 +{
  1.2976 +    PIXMAN_COMPOSITE_ARGS (info);
  1.2977 +    uint32_t    *dst_line, *dst;
  1.2978 +    uint32_t    *src_line, *src;
  1.2979 +    uint32_t mask;
  1.2980 +    int dst_stride, src_stride;
  1.2981 +    int32_t w;
  1.2982 +
  1.2983 +    __m128i xmm_mask, xmm_alpha;
  1.2984 +    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  1.2985 +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1.2986 +
  1.2987 +    PIXMAN_IMAGE_GET_LINE (
  1.2988 +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.2989 +    PIXMAN_IMAGE_GET_LINE (
  1.2990 +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.2991 +
  1.2992 +    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
  1.2993 +
  1.2994 +    xmm_mask = create_mask_16_128 (mask >> 24);
  1.2995 +    xmm_alpha = mask_00ff;
  1.2996 +
  1.2997 +    while (height--)
  1.2998 +    {
  1.2999 +	dst = dst_line;
  1.3000 +	dst_line += dst_stride;
  1.3001 +	src = src_line;
  1.3002 +	src_line += src_stride;
  1.3003 +	w = width;
  1.3004 +
  1.3005 +	while (w && (uintptr_t)dst & 15)
  1.3006 +	{
  1.3007 +	    uint32_t s = (*src++) | 0xff000000;
  1.3008 +	    uint32_t d = *dst;
  1.3009 +
  1.3010 +	    __m128i src   = unpack_32_1x128 (s);
  1.3011 +	    __m128i alpha = xmm_alpha;
  1.3012 +	    __m128i mask  = xmm_mask;
  1.3013 +	    __m128i dest  = unpack_32_1x128 (d);
  1.3014 +
  1.3015 +	    *dst++ = pack_1x128_32 (
  1.3016 +		in_over_1x128 (&src, &alpha, &mask, &dest));
  1.3017 +
  1.3018 +	    w--;
  1.3019 +	}
  1.3020 +
  1.3021 +	while (w >= 4)
  1.3022 +	{
  1.3023 +	    xmm_src = _mm_or_si128 (
  1.3024 +		load_128_unaligned ((__m128i*)src), mask_ff000000);
  1.3025 +	    xmm_dst = load_128_aligned ((__m128i*)dst);
  1.3026 +
  1.3027 +	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  1.3028 +	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1.3029 +
  1.3030 +	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.3031 +			   &xmm_alpha, &xmm_alpha,
  1.3032 +			   &xmm_mask, &xmm_mask,
  1.3033 +			   &xmm_dst_lo, &xmm_dst_hi);
  1.3034 +
  1.3035 +	    save_128_aligned (
  1.3036 +		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.3037 +
  1.3038 +	    dst += 4;
  1.3039 +	    src += 4;
  1.3040 +	    w -= 4;
  1.3041 +
  1.3042 +	}
  1.3043 +
  1.3044 +	while (w)
  1.3045 +	{
  1.3046 +	    uint32_t s = (*src++) | 0xff000000;
  1.3047 +	    uint32_t d = *dst;
  1.3048 +
  1.3049 +	    __m128i src  = unpack_32_1x128 (s);
  1.3050 +	    __m128i alpha = xmm_alpha;
  1.3051 +	    __m128i mask  = xmm_mask;
  1.3052 +	    __m128i dest  = unpack_32_1x128 (d);
  1.3053 +
  1.3054 +	    *dst++ = pack_1x128_32 (
  1.3055 +		in_over_1x128 (&src, &alpha, &mask, &dest));
  1.3056 +
  1.3057 +	    w--;
  1.3058 +	}
  1.3059 +    }
  1.3060 +
  1.3061 +}
  1.3062 +
  1.3063 +static void
  1.3064 +sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
  1.3065 +                               pixman_composite_info_t *info)
  1.3066 +{
  1.3067 +    PIXMAN_COMPOSITE_ARGS (info);
  1.3068 +    int dst_stride, src_stride;
  1.3069 +    uint32_t    *dst_line, *dst;
  1.3070 +    uint32_t    *src_line, *src;
  1.3071 +
  1.3072 +    PIXMAN_IMAGE_GET_LINE (
  1.3073 +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.3074 +    PIXMAN_IMAGE_GET_LINE (
  1.3075 +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.3076 +
  1.3077 +    dst = dst_line;
  1.3078 +    src = src_line;
  1.3079 +
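         +    /* Composite one scanline at a time using sse2_combine_over_u. */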
  1.3080 +    while (height--)
  1.3081 +    {
  1.3082 +	sse2_combine_over_u (imp, op, dst, src, NULL, width);
  1.3083 +
  1.3084 +	dst += dst_stride;
  1.3085 +	src += src_stride;
  1.3086 +    }
  1.3087 +}
  1.3088 +
  1.3089 +static force_inline uint16_t
  1.3090 +composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
  1.3091 +{
  1.3092 +    __m128i ms;
  1.3093 +
  1.3094 +    ms = unpack_32_1x128 (src);
  1.3095 +    return pack_565_32_16 (
  1.3096 +	pack_1x128_32 (
  1.3097 +	    over_1x128 (
  1.3098 +		ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
  1.3099 +}
  1.3100 +
  1.3101 +static void
  1.3102 +sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
  1.3103 +                               pixman_composite_info_t *info)
  1.3104 +{
  1.3105 +    PIXMAN_COMPOSITE_ARGS (info);
  1.3106 +    uint16_t    *dst_line, *dst, d;
  1.3107 +    uint32_t    *src_line, *src, s;
  1.3108 +    int dst_stride, src_stride;
  1.3109 +    int32_t w;
  1.3110 +
  1.3111 +    __m128i xmm_alpha_lo, xmm_alpha_hi;
  1.3112 +    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  1.3113 +    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
  1.3114 +
  1.3115 +    PIXMAN_IMAGE_GET_LINE (
  1.3116 +	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  1.3117 +    PIXMAN_IMAGE_GET_LINE (
  1.3118 +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.3119 +
  1.3120 +    while (height--)
  1.3121 +    {
  1.3122 +	dst = dst_line;
  1.3123 +	src = src_line;
  1.3124 +
  1.3125 +	dst_line += dst_stride;
  1.3126 +	src_line += src_stride;
  1.3127 +	w = width;
  1.3128 +
  1.3129 +	/* Align dst on a 16-byte boundary */
  1.3130 +	while (w &&
  1.3131 +	       ((uintptr_t)dst & 15))
  1.3132 +	{
  1.3133 +	    s = *src++;
  1.3134 +	    d = *dst;
  1.3135 +
  1.3136 +	    *dst++ = composite_over_8888_0565pixel (s, d);
  1.3137 +	    w--;
  1.3138 +	}
  1.3139 +
  1.3140 +	/* It's an 8-pixel loop */
  1.3141 +	while (w >= 8)
  1.3142 +	{
  1.3143 +	    /* Load the source unaligned because its address
  1.3144 +	     * alignment is not guaranteed.
  1.3145 +	     */
  1.3146 +	    xmm_src = load_128_unaligned ((__m128i*) src);
  1.3147 +	    xmm_dst = load_128_aligned ((__m128i*) dst);
  1.3148 +
  1.3149 +	    /* Unpacking */
  1.3150 +	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  1.3151 +	    unpack_565_128_4x128 (xmm_dst,
  1.3152 +				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
  1.3153 +	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1.3154 +				&xmm_alpha_lo, &xmm_alpha_hi);
  1.3155 +
  1.3156 +	    /* Load the next 4 source pixels early to overlap
  1.3157 +	     * the memory read with the blending work.
  1.3158 +	     */
  1.3159 +	    xmm_src = load_128_unaligned ((__m128i*) (src + 4));
  1.3160 +
  1.3161 +	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.3162 +			&xmm_alpha_lo, &xmm_alpha_hi,
  1.3163 +			&xmm_dst0, &xmm_dst1);
  1.3164 +
  1.3165 +	    /* Unpacking */
  1.3166 +	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  1.3167 +	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1.3168 +				&xmm_alpha_lo, &xmm_alpha_hi);
  1.3169 +
  1.3170 +	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.3171 +			&xmm_alpha_lo, &xmm_alpha_hi,
  1.3172 +			&xmm_dst2, &xmm_dst3);
  1.3173 +
  1.3174 +	    save_128_aligned (
  1.3175 +		(__m128i*)dst, pack_565_4x128_128 (
  1.3176 +		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
  1.3177 +
  1.3178 +	    w -= 8;
  1.3179 +	    dst += 8;
  1.3180 +	    src += 8;
  1.3181 +	}
  1.3182 +
  1.3183 +	while (w--)
  1.3184 +	{
  1.3185 +	    s = *src++;
  1.3186 +	    d = *dst;
  1.3187 +
  1.3188 +	    *dst++ = composite_over_8888_0565pixel (s, d);
  1.3189 +	}
  1.3190 +    }
  1.3191 +
  1.3192 +}
  1.3193 +
  1.3194 +static void
  1.3195 +sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
  1.3196 +                              pixman_composite_info_t *info)
  1.3197 +{
  1.3198 +    PIXMAN_COMPOSITE_ARGS (info);
  1.3199 +    uint32_t src, srca;
  1.3200 +    uint32_t *dst_line, *dst;
  1.3201 +    uint8_t *mask_line, *mask;
  1.3202 +    int dst_stride, mask_stride;
  1.3203 +    int32_t w;
  1.3204 +    uint32_t m, d;
  1.3205 +
  1.3206 +    __m128i xmm_src, xmm_alpha, xmm_def;
  1.3207 +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1.3208 +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  1.3209 +
  1.3210 +    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
  1.3211 +
  1.3212 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.3213 +
  1.3214 +    srca = src >> 24;
  1.3215 +    if (src == 0)
  1.3216 +	return;
  1.3217 +
  1.3218 +    PIXMAN_IMAGE_GET_LINE (
  1.3219 +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.3220 +    PIXMAN_IMAGE_GET_LINE (
  1.3221 +	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  1.3222 +
  1.3223 +    xmm_def = create_mask_2x32_128 (src, src);
  1.3224 +    xmm_src = expand_pixel_32_1x128 (src);
  1.3225 +    xmm_alpha = expand_alpha_1x128 (xmm_src);
  1.3226 +    mmx_src   = xmm_src;
  1.3227 +    mmx_alpha = xmm_alpha;
  1.3228 +
  1.3229 +    while (height--)
  1.3230 +    {
  1.3231 +	dst = dst_line;
  1.3232 +	dst_line += dst_stride;
  1.3233 +	mask = mask_line;
  1.3234 +	mask_line += mask_stride;
  1.3235 +	w = width;
  1.3236 +
  1.3237 +	while (w && (uintptr_t)dst & 15)
  1.3238 +	{
  1.3239 +	    uint8_t m = *mask++;
  1.3240 +
  1.3241 +	    if (m)
  1.3242 +	    {
  1.3243 +		d = *dst;
  1.3244 +		mmx_mask = expand_pixel_8_1x128 (m);
  1.3245 +		mmx_dest = unpack_32_1x128 (d);
  1.3246 +
  1.3247 +		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
  1.3248 +		                                   &mmx_alpha,
  1.3249 +		                                   &mmx_mask,
  1.3250 +		                                   &mmx_dest));
  1.3251 +	    }
  1.3252 +
  1.3253 +	    w--;
  1.3254 +	    dst++;
  1.3255 +	}
  1.3256 +
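         +	/* Read 4 mask bytes at once; an opaque source under a fully set mask is a plain solid store. */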
  1.3257 +	while (w >= 4)
  1.3258 +	{
  1.3259 +	    m = *((uint32_t*)mask);
  1.3260 +
  1.3261 +	    if (srca == 0xff && m == 0xffffffff)
  1.3262 +	    {
  1.3263 +		save_128_aligned ((__m128i*)dst, xmm_def);
  1.3264 +	    }
  1.3265 +	    else if (m)
  1.3266 +	    {
  1.3267 +		xmm_dst = load_128_aligned ((__m128i*) dst);
  1.3268 +		xmm_mask = unpack_32_1x128 (m);
  1.3269 +		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
  1.3270 +
  1.3271 +		/* Unpacking */
  1.3272 +		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1.3273 +		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  1.3274 +
  1.3275 +		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
  1.3276 +					&xmm_mask_lo, &xmm_mask_hi);
  1.3277 +
  1.3278 +		in_over_2x128 (&xmm_src, &xmm_src,
  1.3279 +			       &xmm_alpha, &xmm_alpha,
  1.3280 +			       &xmm_mask_lo, &xmm_mask_hi,
  1.3281 +			       &xmm_dst_lo, &xmm_dst_hi);
  1.3282 +
  1.3283 +		save_128_aligned (
  1.3284 +		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.3285 +	    }
  1.3286 +
  1.3287 +	    w -= 4;
  1.3288 +	    dst += 4;
  1.3289 +	    mask += 4;
  1.3290 +	}
  1.3291 +
  1.3292 +	while (w)
  1.3293 +	{
  1.3294 +	    uint8_t m = *mask++;
  1.3295 +
  1.3296 +	    if (m)
  1.3297 +	    {
  1.3298 +		d = *dst;
  1.3299 +		mmx_mask = expand_pixel_8_1x128 (m);
  1.3300 +		mmx_dest = unpack_32_1x128 (d);
  1.3301 +
  1.3302 +		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
  1.3303 +		                                   &mmx_alpha,
  1.3304 +		                                   &mmx_mask,
  1.3305 +		                                   &mmx_dest));
  1.3306 +	    }
  1.3307 +
  1.3308 +	    w--;
  1.3309 +	    dst++;
  1.3310 +	}
  1.3311 +    }
  1.3312 +
  1.3313 +}
  1.3314 +
  1.3315 +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
  1.3316 +__attribute__((__force_align_arg_pointer__))
  1.3317 +#endif
  1.3318 +static pixman_bool_t
  1.3319 +sse2_fill (pixman_implementation_t *imp,
  1.3320 +           uint32_t *               bits,
  1.3321 +           int                      stride,
  1.3322 +           int                      bpp,
  1.3323 +           int                      x,
  1.3324 +           int                      y,
  1.3325 +           int                      width,
  1.3326 +           int                      height,
  1.3327 +           uint32_t		    filler)
  1.3328 +{
  1.3329 +    uint32_t byte_width;
  1.3330 +    uint8_t *byte_line;
  1.3331 +
  1.3332 +    __m128i xmm_def;
  1.3333 +
  1.3334 +    if (bpp == 8)
  1.3335 +    {
  1.3336 +	uint8_t b;
  1.3337 +	uint16_t w;
  1.3338 +
  1.3339 +	stride = stride * (int) sizeof (uint32_t) / 1;
  1.3340 +	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
  1.3341 +	byte_width = width;
  1.3342 +	stride *= 1;
  1.3343 +
  1.3344 +	b = filler & 0xff;
  1.3345 +	w = (b << 8) | b;
  1.3346 +	filler = (w << 16) | w;
  1.3347 +    }
  1.3348 +    else if (bpp == 16)
  1.3349 +    {
  1.3350 +	stride = stride * (int) sizeof (uint32_t) / 2;
  1.3351 +	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
  1.3352 +	byte_width = 2 * width;
  1.3353 +	stride *= 2;
  1.3354 +
  1.3355 +        filler = (filler & 0xffff) * 0x00010001;
  1.3356 +    }
  1.3357 +    else if (bpp == 32)
  1.3358 +    {
  1.3359 +	stride = stride * (int) sizeof (uint32_t) / 4;
  1.3360 +	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
  1.3361 +	byte_width = 4 * width;
  1.3362 +	stride *= 4;
  1.3363 +    }
  1.3364 +    else
  1.3365 +    {
  1.3366 +	return FALSE;
  1.3367 +    }
  1.3368 +
  1.3369 +    xmm_def = create_mask_2x32_128 (filler, filler);
  1.3370 +
  1.3371 +    while (height--)
  1.3372 +    {
  1.3373 +	int w;
  1.3374 +	uint8_t *d = byte_line;
  1.3375 +	byte_line += stride;
  1.3376 +	w = byte_width;
  1.3377 +
  1.3378 +	if (w >= 1 && ((uintptr_t)d & 1))
  1.3379 +	{
  1.3380 +	    *(uint8_t *)d = filler;
  1.3381 +	    w -= 1;
  1.3382 +	    d += 1;
  1.3383 +	}
  1.3384 +
  1.3385 +	while (w >= 2 && ((uintptr_t)d & 3))
  1.3386 +	{
  1.3387 +	    *(uint16_t *)d = filler;
  1.3388 +	    w -= 2;
  1.3389 +	    d += 2;
  1.3390 +	}
  1.3391 +
  1.3392 +	while (w >= 4 && ((uintptr_t)d & 15))
  1.3393 +	{
  1.3394 +	    *(uint32_t *)d = filler;
  1.3395 +
  1.3396 +	    w -= 4;
  1.3397 +	    d += 4;
  1.3398 +	}
  1.3399 +
  1.3400 +	while (w >= 128)
  1.3401 +	{
  1.3402 +	    save_128_aligned ((__m128i*)(d),     xmm_def);
  1.3403 +	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
  1.3404 +	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
  1.3405 +	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
  1.3406 +	    save_128_aligned ((__m128i*)(d + 64),  xmm_def);
  1.3407 +	    save_128_aligned ((__m128i*)(d + 80),  xmm_def);
  1.3408 +	    save_128_aligned ((__m128i*)(d + 96),  xmm_def);
  1.3409 +	    save_128_aligned ((__m128i*)(d + 112), xmm_def);
  1.3410 +
  1.3411 +	    d += 128;
  1.3412 +	    w -= 128;
  1.3413 +	}
  1.3414 +
  1.3415 +	if (w >= 64)
  1.3416 +	{
  1.3417 +	    save_128_aligned ((__m128i*)(d),     xmm_def);
  1.3418 +	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
  1.3419 +	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
  1.3420 +	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
  1.3421 +
  1.3422 +	    d += 64;
  1.3423 +	    w -= 64;
  1.3424 +	}
  1.3425 +
  1.3426 +	if (w >= 32)
  1.3427 +	{
  1.3428 +	    save_128_aligned ((__m128i*)(d),     xmm_def);
  1.3429 +	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
  1.3430 +
  1.3431 +	    d += 32;
  1.3432 +	    w -= 32;
  1.3433 +	}
  1.3434 +
  1.3435 +	if (w >= 16)
  1.3436 +	{
  1.3437 +	    save_128_aligned ((__m128i*)(d),     xmm_def);
  1.3438 +
  1.3439 +	    d += 16;
  1.3440 +	    w -= 16;
  1.3441 +	}
  1.3442 +
  1.3443 +	while (w >= 4)
  1.3444 +	{
  1.3445 +	    *(uint32_t *)d = filler;
  1.3446 +
  1.3447 +	    w -= 4;
  1.3448 +	    d += 4;
  1.3449 +	}
  1.3450 +
  1.3451 +	if (w >= 2)
  1.3452 +	{
  1.3453 +	    *(uint16_t *)d = filler;
  1.3454 +	    w -= 2;
  1.3455 +	    d += 2;
  1.3456 +	}
  1.3457 +
  1.3458 +	if (w >= 1)
  1.3459 +	{
  1.3460 +	    *(uint8_t *)d = filler;
  1.3461 +	    w -= 1;
  1.3462 +	    d += 1;
  1.3463 +	}
  1.3464 +    }
  1.3465 +
  1.3466 +    return TRUE;
  1.3467 +}
  1.3468 +
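          +/* SRC: solid source, a8 mask, 32 bpp destination.  Every destination
          + * pixel is replaced by src * mask (zero where the mask is zero); a
          + * fully transparent source degenerates to a plain fill.
          + */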
  1.3469 +static void
  1.3470 +sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
  1.3471 +                             pixman_composite_info_t *info)
  1.3472 +{
  1.3473 +    PIXMAN_COMPOSITE_ARGS (info);
  1.3474 +    uint32_t src, srca;
  1.3475 +    uint32_t    *dst_line, *dst;
  1.3476 +    uint8_t     *mask_line, *mask;
  1.3477 +    int dst_stride, mask_stride;
  1.3478 +    int32_t w;
  1.3479 +    uint32_t m;
  1.3480 +
  1.3481 +    __m128i xmm_src, xmm_def;
  1.3482 +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  1.3483 +
  1.3484 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.3485 +
  1.3486 +    srca = src >> 24;
  1.3487 +    if (src == 0)
  1.3488 +    {
  1.3489 +	sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
  1.3490 +		   PIXMAN_FORMAT_BPP (dest_image->bits.format),
  1.3491 +		   dest_x, dest_y, width, height, 0);
  1.3492 +	return;
  1.3493 +    }
  1.3494 +
  1.3495 +    PIXMAN_IMAGE_GET_LINE (
  1.3496 +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.3497 +    PIXMAN_IMAGE_GET_LINE (
  1.3498 +	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  1.3499 +
  1.3500 +    xmm_def = create_mask_2x32_128 (src, src);
  1.3501 +    xmm_src = expand_pixel_32_1x128 (src);
  1.3502 +
  1.3503 +    while (height--)
  1.3504 +    {
  1.3505 +	dst = dst_line;
  1.3506 +	dst_line += dst_stride;
  1.3507 +	mask = mask_line;
  1.3508 +	mask_line += mask_stride;
  1.3509 +	w = width;
  1.3510 +
  1.3511 +	while (w && (uintptr_t)dst & 15)
  1.3512 +	{
  1.3513 +	    uint8_t m = *mask++;
  1.3514 +
  1.3515 +	    if (m)
  1.3516 +	    {
  1.3517 +		*dst = pack_1x128_32 (
  1.3518 +		    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
  1.3519 +	    }
  1.3520 +	    else
  1.3521 +	    {
  1.3522 +		*dst = 0;
  1.3523 +	    }
  1.3524 +
  1.3525 +	    w--;
  1.3526 +	    dst++;
  1.3527 +	}
  1.3528 +
  1.3529 +	while (w >= 4)
  1.3530 +	{
  1.3531 +	    m = *((uint32_t*)mask);
  1.3532 +
  1.3533 +	    if (srca == 0xff && m == 0xffffffff)
  1.3534 +	    {
  1.3535 +		save_128_aligned ((__m128i*)dst, xmm_def);
  1.3536 +	    }
  1.3537 +	    else if (m)
  1.3538 +	    {
  1.3539 +		xmm_mask = unpack_32_1x128 (m);
  1.3540 +		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
  1.3541 +
  1.3542 +		/* Unpacking */
  1.3543 +		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  1.3544 +
  1.3545 +		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
  1.3546 +					&xmm_mask_lo, &xmm_mask_hi);
  1.3547 +
  1.3548 +		pix_multiply_2x128 (&xmm_src, &xmm_src,
  1.3549 +				    &xmm_mask_lo, &xmm_mask_hi,
  1.3550 +				    &xmm_mask_lo, &xmm_mask_hi);
  1.3551 +
  1.3552 +		save_128_aligned (
  1.3553 +		    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
  1.3554 +	    }
  1.3555 +	    else
  1.3556 +	    {
  1.3557 +		save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
  1.3558 +	    }
  1.3559 +
  1.3560 +	    w -= 4;
  1.3561 +	    dst += 4;
  1.3562 +	    mask += 4;
  1.3563 +	}
  1.3564 +
  1.3565 +	while (w)
  1.3566 +	{
  1.3567 +	    uint8_t m = *mask++;
  1.3568 +
  1.3569 +	    if (m)
  1.3570 +	    {
  1.3571 +		*dst = pack_1x128_32 (
  1.3572 +		    pix_multiply_1x128 (
  1.3573 +			xmm_src, expand_pixel_8_1x128 (m)));
  1.3574 +	    }
  1.3575 +	    else
  1.3576 +	    {
  1.3577 +		*dst = 0;
  1.3578 +	    }
  1.3579 +
  1.3580 +	    w--;
  1.3581 +	    dst++;
  1.3582 +	}
  1.3583 +    }
  1.3584 +
  1.3585 +}
  1.3586 +
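          +/* OVER: solid source, a8 mask, r5g6b5 destination.  Pixels are
          + * expanded from 565 to 8888, combined with in_over and packed back,
          + * eight pixels per SSE2 iteration.
          + */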
  1.3587 +static void
  1.3588 +sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
  1.3589 +                              pixman_composite_info_t *info)
  1.3590 +{
  1.3591 +    PIXMAN_COMPOSITE_ARGS (info);
  1.3592 +    uint32_t src;
  1.3593 +    uint16_t    *dst_line, *dst, d;
  1.3594 +    uint8_t     *mask_line, *mask;
  1.3595 +    int dst_stride, mask_stride;
  1.3596 +    int32_t w;
  1.3597 +    uint32_t m;
  1.3598 +    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
  1.3599 +
  1.3600 +    __m128i xmm_src, xmm_alpha;
  1.3601 +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  1.3602 +    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
  1.3603 +
  1.3604 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.3605 +
  1.3606 +    if (src == 0)
  1.3607 +	return;
  1.3608 +
  1.3609 +    PIXMAN_IMAGE_GET_LINE (
  1.3610 +	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  1.3611 +    PIXMAN_IMAGE_GET_LINE (
  1.3612 +	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  1.3613 +
  1.3614 +    xmm_src = expand_pixel_32_1x128 (src);
  1.3615 +    xmm_alpha = expand_alpha_1x128 (xmm_src);
  1.3616 +    mmx_src = xmm_src;
  1.3617 +    mmx_alpha = xmm_alpha;
  1.3618 +
  1.3619 +    while (height--)
  1.3620 +    {
  1.3621 +	dst = dst_line;
  1.3622 +	dst_line += dst_stride;
  1.3623 +	mask = mask_line;
  1.3624 +	mask_line += mask_stride;
  1.3625 +	w = width;
  1.3626 +
  1.3627 +	while (w && (uintptr_t)dst & 15)
  1.3628 +	{
  1.3629 +	    m = *mask++;
  1.3630 +
  1.3631 +	    if (m)
  1.3632 +	    {
  1.3633 +		d = *dst;
  1.3634 +		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
  1.3635 +		mmx_dest = expand565_16_1x128 (d);
  1.3636 +
  1.3637 +		*dst = pack_565_32_16 (
  1.3638 +		    pack_1x128_32 (
  1.3639 +			in_over_1x128 (
  1.3640 +			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
  1.3641 +	    }
  1.3642 +
  1.3643 +	    w--;
  1.3644 +	    dst++;
  1.3645 +	}
  1.3646 +
  1.3647 +	while (w >= 8)
  1.3648 +	{
  1.3649 +	    xmm_dst = load_128_aligned ((__m128i*) dst);
  1.3650 +	    unpack_565_128_4x128 (xmm_dst,
  1.3651 +				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
  1.3652 +
  1.3653 +	    m = *((uint32_t*)mask);
  1.3654 +	    mask += 4;
  1.3655 +
  1.3656 +	    if (m)
  1.3657 +	    {
  1.3658 +		xmm_mask = unpack_32_1x128 (m);
  1.3659 +		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
  1.3660 +
  1.3661 +		/* Unpacking */
  1.3662 +		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  1.3663 +
  1.3664 +		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
  1.3665 +					&xmm_mask_lo, &xmm_mask_hi);
  1.3666 +
  1.3667 +		in_over_2x128 (&xmm_src, &xmm_src,
  1.3668 +			       &xmm_alpha, &xmm_alpha,
  1.3669 +			       &xmm_mask_lo, &xmm_mask_hi,
  1.3670 +			       &xmm_dst0, &xmm_dst1);
  1.3671 +	    }
  1.3672 +
  1.3673 +	    m = *((uint32_t*)mask);
  1.3674 +	    mask += 4;
  1.3675 +
  1.3676 +	    if (m)
  1.3677 +	    {
  1.3678 +		xmm_mask = unpack_32_1x128 (m);
  1.3679 +		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
  1.3680 +
  1.3681 +		/* Unpacking */
  1.3682 +		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  1.3683 +
  1.3684 +		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
  1.3685 +					&xmm_mask_lo, &xmm_mask_hi);
  1.3686 +		in_over_2x128 (&xmm_src, &xmm_src,
  1.3687 +			       &xmm_alpha, &xmm_alpha,
  1.3688 +			       &xmm_mask_lo, &xmm_mask_hi,
  1.3689 +			       &xmm_dst2, &xmm_dst3);
  1.3690 +	    }
  1.3691 +
  1.3692 +	    save_128_aligned (
  1.3693 +		(__m128i*)dst, pack_565_4x128_128 (
  1.3694 +		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
  1.3695 +
  1.3696 +	    w -= 8;
  1.3697 +	    dst += 8;
  1.3698 +	}
  1.3699 +
  1.3700 +	while (w)
  1.3701 +	{
  1.3702 +	    m = *mask++;
  1.3703 +
  1.3704 +	    if (m)
  1.3705 +	    {
  1.3706 +		d = *dst;
  1.3707 +		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
  1.3708 +		mmx_dest = expand565_16_1x128 (d);
  1.3709 +
  1.3710 +		*dst = pack_565_32_16 (
  1.3711 +		    pack_1x128_32 (
  1.3712 +			in_over_1x128 (
  1.3713 +			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
  1.3714 +	    }
  1.3715 +
  1.3716 +	    w--;
  1.3717 +	    dst++;
  1.3718 +	}
  1.3719 +    }
  1.3720 +
  1.3721 +}
  1.3722 +
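          +/* OVER: non-premultiplied 32 bpp "pixbuf" source onto an r5g6b5
          + * destination.  Blocks that are fully opaque only need their red and
          + * blue channels swapped, fully transparent blocks are skipped, and
          + * the rest go through over_rev_non_pre.
          + */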
  1.3723 +static void
  1.3724 +sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
  1.3725 +                                 pixman_composite_info_t *info)
  1.3726 +{
  1.3727 +    PIXMAN_COMPOSITE_ARGS (info);
  1.3728 +    uint16_t    *dst_line, *dst, d;
  1.3729 +    uint32_t    *src_line, *src, s;
  1.3730 +    int dst_stride, src_stride;
  1.3731 +    int32_t w;
  1.3732 +    uint32_t opaque, zero;
  1.3733 +
  1.3734 +    __m128i ms;
  1.3735 +    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  1.3736 +    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
  1.3737 +
  1.3738 +    PIXMAN_IMAGE_GET_LINE (
  1.3739 +	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  1.3740 +    PIXMAN_IMAGE_GET_LINE (
  1.3741 +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.3742 +
  1.3743 +    while (height--)
  1.3744 +    {
  1.3745 +	dst = dst_line;
  1.3746 +	dst_line += dst_stride;
  1.3747 +	src = src_line;
  1.3748 +	src_line += src_stride;
  1.3749 +	w = width;
  1.3750 +
  1.3751 +	while (w && (uintptr_t)dst & 15)
  1.3752 +	{
  1.3753 +	    s = *src++;
  1.3754 +	    d = *dst;
  1.3755 +
  1.3756 +	    ms = unpack_32_1x128 (s);
  1.3757 +
  1.3758 +	    *dst++ = pack_565_32_16 (
  1.3759 +		pack_1x128_32 (
  1.3760 +		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
  1.3761 +	    w--;
  1.3762 +	}
  1.3763 +
  1.3764 +	while (w >= 8)
  1.3765 +	{
  1.3766 +	    /* First round */
  1.3767 +	    xmm_src = load_128_unaligned ((__m128i*)src);
  1.3768 +	    xmm_dst = load_128_aligned  ((__m128i*)dst);
  1.3769 +
  1.3770 +	    opaque = is_opaque (xmm_src);
  1.3771 +	    zero = is_zero (xmm_src);
  1.3772 +
  1.3773 +	    unpack_565_128_4x128 (xmm_dst,
  1.3774 +				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
  1.3775 +	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  1.3776 +
   1.3777 +	    /* preload next round */
  1.3778 +	    xmm_src = load_128_unaligned ((__m128i*)(src + 4));
  1.3779 +
  1.3780 +	    if (opaque)
  1.3781 +	    {
  1.3782 +		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
  1.3783 +				     &xmm_dst0, &xmm_dst1);
  1.3784 +	    }
  1.3785 +	    else if (!zero)
  1.3786 +	    {
  1.3787 +		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
  1.3788 +					&xmm_dst0, &xmm_dst1);
  1.3789 +	    }
  1.3790 +
  1.3791 +	    /* Second round */
  1.3792 +	    opaque = is_opaque (xmm_src);
  1.3793 +	    zero = is_zero (xmm_src);
  1.3794 +
  1.3795 +	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  1.3796 +
  1.3797 +	    if (opaque)
  1.3798 +	    {
  1.3799 +		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
  1.3800 +				     &xmm_dst2, &xmm_dst3);
  1.3801 +	    }
  1.3802 +	    else if (!zero)
  1.3803 +	    {
  1.3804 +		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
  1.3805 +					&xmm_dst2, &xmm_dst3);
  1.3806 +	    }
  1.3807 +
  1.3808 +	    save_128_aligned (
  1.3809 +		(__m128i*)dst, pack_565_4x128_128 (
  1.3810 +		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
  1.3811 +
  1.3812 +	    w -= 8;
  1.3813 +	    src += 8;
  1.3814 +	    dst += 8;
  1.3815 +	}
  1.3816 +
  1.3817 +	while (w)
  1.3818 +	{
  1.3819 +	    s = *src++;
  1.3820 +	    d = *dst;
  1.3821 +
  1.3822 +	    ms = unpack_32_1x128 (s);
  1.3823 +
  1.3824 +	    *dst++ = pack_565_32_16 (
  1.3825 +		pack_1x128_32 (
  1.3826 +		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
  1.3827 +	    w--;
  1.3828 +	}
  1.3829 +    }
  1.3830 +
  1.3831 +}
  1.3832 +
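          +/* OVER: non-premultiplied 32 bpp "pixbuf" source onto a 32 bpp
          + * destination, with the same opaque/transparent shortcuts as the
          + * 0565 variant above.
          + */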
  1.3833 +static void
  1.3834 +sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
  1.3835 +                                 pixman_composite_info_t *info)
  1.3836 +{
  1.3837 +    PIXMAN_COMPOSITE_ARGS (info);
  1.3838 +    uint32_t    *dst_line, *dst, d;
  1.3839 +    uint32_t    *src_line, *src, s;
  1.3840 +    int dst_stride, src_stride;
  1.3841 +    int32_t w;
  1.3842 +    uint32_t opaque, zero;
  1.3843 +
  1.3844 +    __m128i xmm_src_lo, xmm_src_hi;
  1.3845 +    __m128i xmm_dst_lo, xmm_dst_hi;
  1.3846 +
  1.3847 +    PIXMAN_IMAGE_GET_LINE (
  1.3848 +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.3849 +    PIXMAN_IMAGE_GET_LINE (
  1.3850 +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.3851 +
  1.3852 +    while (height--)
  1.3853 +    {
  1.3854 +	dst = dst_line;
  1.3855 +	dst_line += dst_stride;
  1.3856 +	src = src_line;
  1.3857 +	src_line += src_stride;
  1.3858 +	w = width;
  1.3859 +
  1.3860 +	while (w && (uintptr_t)dst & 15)
  1.3861 +	{
  1.3862 +	    s = *src++;
  1.3863 +	    d = *dst;
  1.3864 +
  1.3865 +	    *dst++ = pack_1x128_32 (
  1.3866 +		over_rev_non_pre_1x128 (
  1.3867 +		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
  1.3868 +
  1.3869 +	    w--;
  1.3870 +	}
  1.3871 +
  1.3872 +	while (w >= 4)
  1.3873 +	{
  1.3874 +	    xmm_src_hi = load_128_unaligned ((__m128i*)src);
  1.3875 +
  1.3876 +	    opaque = is_opaque (xmm_src_hi);
  1.3877 +	    zero = is_zero (xmm_src_hi);
  1.3878 +
  1.3879 +	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1.3880 +
  1.3881 +	    if (opaque)
  1.3882 +	    {
  1.3883 +		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
  1.3884 +				     &xmm_dst_lo, &xmm_dst_hi);
  1.3885 +
  1.3886 +		save_128_aligned (
  1.3887 +		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.3888 +	    }
  1.3889 +	    else if (!zero)
  1.3890 +	    {
  1.3891 +		xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
  1.3892 +
  1.3893 +		unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.3894 +
  1.3895 +		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
  1.3896 +					&xmm_dst_lo, &xmm_dst_hi);
  1.3897 +
  1.3898 +		save_128_aligned (
  1.3899 +		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.3900 +	    }
  1.3901 +
  1.3902 +	    w -= 4;
  1.3903 +	    dst += 4;
  1.3904 +	    src += 4;
  1.3905 +	}
  1.3906 +
  1.3907 +	while (w)
  1.3908 +	{
  1.3909 +	    s = *src++;
  1.3910 +	    d = *dst;
  1.3911 +
  1.3912 +	    *dst++ = pack_1x128_32 (
  1.3913 +		over_rev_non_pre_1x128 (
  1.3914 +		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
  1.3915 +
  1.3916 +	    w--;
  1.3917 +	}
  1.3918 +    }
  1.3919 +
  1.3920 +}
  1.3921 +
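          +/* OVER with component alpha: solid source, 32 bpp per-channel mask,
          + * r5g6b5 destination.  A movemask over cmpeq-with-zero detects groups
          + * of four fully transparent mask pixels so they can be skipped.
          + */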
  1.3922 +static void
  1.3923 +sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
  1.3924 +                                    pixman_composite_info_t *info)
  1.3925 +{
  1.3926 +    PIXMAN_COMPOSITE_ARGS (info);
  1.3927 +    uint32_t src;
  1.3928 +    uint16_t    *dst_line, *dst, d;
  1.3929 +    uint32_t    *mask_line, *mask, m;
  1.3930 +    int dst_stride, mask_stride;
  1.3931 +    int w;
  1.3932 +    uint32_t pack_cmp;
  1.3933 +
  1.3934 +    __m128i xmm_src, xmm_alpha;
  1.3935 +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  1.3936 +    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
  1.3937 +
  1.3938 +    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
  1.3939 +
  1.3940 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.3941 +
  1.3942 +    if (src == 0)
  1.3943 +	return;
  1.3944 +
  1.3945 +    PIXMAN_IMAGE_GET_LINE (
  1.3946 +	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  1.3947 +    PIXMAN_IMAGE_GET_LINE (
  1.3948 +	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
  1.3949 +
  1.3950 +    xmm_src = expand_pixel_32_1x128 (src);
  1.3951 +    xmm_alpha = expand_alpha_1x128 (xmm_src);
  1.3952 +    mmx_src = xmm_src;
  1.3953 +    mmx_alpha = xmm_alpha;
  1.3954 +
  1.3955 +    while (height--)
  1.3956 +    {
  1.3957 +	w = width;
  1.3958 +	mask = mask_line;
  1.3959 +	dst = dst_line;
  1.3960 +	mask_line += mask_stride;
  1.3961 +	dst_line += dst_stride;
  1.3962 +
  1.3963 +	while (w && ((uintptr_t)dst & 15))
  1.3964 +	{
  1.3965 +	    m = *(uint32_t *) mask;
  1.3966 +
  1.3967 +	    if (m)
  1.3968 +	    {
  1.3969 +		d = *dst;
  1.3970 +		mmx_mask = unpack_32_1x128 (m);
  1.3971 +		mmx_dest = expand565_16_1x128 (d);
  1.3972 +
  1.3973 +		*dst = pack_565_32_16 (
  1.3974 +		    pack_1x128_32 (
  1.3975 +			in_over_1x128 (
  1.3976 +			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
  1.3977 +	    }
  1.3978 +
  1.3979 +	    w--;
  1.3980 +	    dst++;
  1.3981 +	    mask++;
  1.3982 +	}
  1.3983 +
  1.3984 +	while (w >= 8)
  1.3985 +	{
  1.3986 +	    /* First round */
  1.3987 +	    xmm_mask = load_128_unaligned ((__m128i*)mask);
  1.3988 +	    xmm_dst = load_128_aligned ((__m128i*)dst);
  1.3989 +
  1.3990 +	    pack_cmp = _mm_movemask_epi8 (
  1.3991 +		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
  1.3992 +
  1.3993 +	    unpack_565_128_4x128 (xmm_dst,
  1.3994 +				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
  1.3995 +	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  1.3996 +
  1.3997 +	    /* preload next round */
  1.3998 +	    xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
  1.3999 +
  1.4001 +	    if (pack_cmp != 0xffff)
  1.4002 +	    {
  1.4003 +		in_over_2x128 (&xmm_src, &xmm_src,
  1.4004 +			       &xmm_alpha, &xmm_alpha,
  1.4005 +			       &xmm_mask_lo, &xmm_mask_hi,
  1.4006 +			       &xmm_dst0, &xmm_dst1);
  1.4007 +	    }
  1.4008 +
  1.4009 +	    /* Second round */
  1.4010 +	    pack_cmp = _mm_movemask_epi8 (
  1.4011 +		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
  1.4012 +
  1.4013 +	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  1.4014 +
  1.4015 +	    if (pack_cmp != 0xffff)
  1.4016 +	    {
  1.4017 +		in_over_2x128 (&xmm_src, &xmm_src,
  1.4018 +			       &xmm_alpha, &xmm_alpha,
  1.4019 +			       &xmm_mask_lo, &xmm_mask_hi,
  1.4020 +			       &xmm_dst2, &xmm_dst3);
  1.4021 +	    }
  1.4022 +
  1.4023 +	    save_128_aligned (
  1.4024 +		(__m128i*)dst, pack_565_4x128_128 (
  1.4025 +		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
  1.4026 +
  1.4027 +	    w -= 8;
  1.4028 +	    dst += 8;
  1.4029 +	    mask += 8;
  1.4030 +	}
  1.4031 +
  1.4032 +	while (w)
  1.4033 +	{
  1.4034 +	    m = *(uint32_t *) mask;
  1.4035 +
  1.4036 +	    if (m)
  1.4037 +	    {
  1.4038 +		d = *dst;
  1.4039 +		mmx_mask = unpack_32_1x128 (m);
  1.4040 +		mmx_dest = expand565_16_1x128 (d);
  1.4041 +
  1.4042 +		*dst = pack_565_32_16 (
  1.4043 +		    pack_1x128_32 (
  1.4044 +			in_over_1x128 (
  1.4045 +			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
  1.4046 +	    }
  1.4047 +
  1.4048 +	    w--;
  1.4049 +	    dst++;
  1.4050 +	    mask++;
  1.4051 +	}
  1.4052 +    }
  1.4053 +
  1.4054 +}
  1.4055 +
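          +/* IN: the alpha of a solid source is multiplied with an a8 mask and
          + * with the a8 destination, sixteen pixels per SSE2 iteration.
          + */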
  1.4056 +static void
  1.4057 +sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
  1.4058 +                         pixman_composite_info_t *info)
  1.4059 +{
  1.4060 +    PIXMAN_COMPOSITE_ARGS (info);
  1.4061 +    uint8_t     *dst_line, *dst;
  1.4062 +    uint8_t     *mask_line, *mask;
  1.4063 +    int dst_stride, mask_stride;
  1.4064 +    uint32_t d, m;
  1.4065 +    uint32_t src;
  1.4066 +    int32_t w;
  1.4067 +
  1.4068 +    __m128i xmm_alpha;
  1.4069 +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  1.4070 +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1.4071 +
  1.4072 +    PIXMAN_IMAGE_GET_LINE (
  1.4073 +	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  1.4074 +    PIXMAN_IMAGE_GET_LINE (
  1.4075 +	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  1.4076 +
  1.4077 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.4078 +
  1.4079 +    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
  1.4080 +
  1.4081 +    while (height--)
  1.4082 +    {
  1.4083 +	dst = dst_line;
  1.4084 +	dst_line += dst_stride;
  1.4085 +	mask = mask_line;
  1.4086 +	mask_line += mask_stride;
  1.4087 +	w = width;
  1.4088 +
  1.4089 +	while (w && ((uintptr_t)dst & 15))
  1.4090 +	{
  1.4091 +	    m = (uint32_t) *mask++;
  1.4092 +	    d = (uint32_t) *dst;
  1.4093 +
  1.4094 +	    *dst++ = (uint8_t) pack_1x128_32 (
  1.4095 +		pix_multiply_1x128 (
  1.4096 +		    pix_multiply_1x128 (xmm_alpha,
  1.4097 +				       unpack_32_1x128 (m)),
  1.4098 +		    unpack_32_1x128 (d)));
  1.4099 +	    w--;
  1.4100 +	}
  1.4101 +
  1.4102 +	while (w >= 16)
  1.4103 +	{
  1.4104 +	    xmm_mask = load_128_unaligned ((__m128i*)mask);
  1.4105 +	    xmm_dst = load_128_aligned ((__m128i*)dst);
  1.4106 +
  1.4107 +	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  1.4108 +	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1.4109 +
  1.4110 +	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
  1.4111 +				&xmm_mask_lo, &xmm_mask_hi,
  1.4112 +				&xmm_mask_lo, &xmm_mask_hi);
  1.4113 +
  1.4114 +	    pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
  1.4115 +				&xmm_dst_lo, &xmm_dst_hi,
  1.4116 +				&xmm_dst_lo, &xmm_dst_hi);
  1.4117 +
  1.4118 +	    save_128_aligned (
  1.4119 +		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.4120 +
  1.4121 +	    mask += 16;
  1.4122 +	    dst += 16;
  1.4123 +	    w -= 16;
  1.4124 +	}
  1.4125 +
  1.4126 +	while (w)
  1.4127 +	{
  1.4128 +	    m = (uint32_t) *mask++;
  1.4129 +	    d = (uint32_t) *dst;
  1.4130 +
  1.4131 +	    *dst++ = (uint8_t) pack_1x128_32 (
  1.4132 +		pix_multiply_1x128 (
  1.4133 +		    pix_multiply_1x128 (
  1.4134 +			xmm_alpha, unpack_32_1x128 (m)),
  1.4135 +		    unpack_32_1x128 (d)));
  1.4136 +	    w--;
  1.4137 +	}
  1.4138 +    }
  1.4139 +
  1.4140 +}
  1.4141 +
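          +/* IN: the a8 destination is scaled by the alpha of a solid source.
          + * An opaque source is a no-op; a transparent one becomes a zero fill.
          + */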
  1.4142 +static void
  1.4143 +sse2_composite_in_n_8 (pixman_implementation_t *imp,
  1.4144 +		       pixman_composite_info_t *info)
  1.4145 +{
  1.4146 +    PIXMAN_COMPOSITE_ARGS (info);
  1.4147 +    uint8_t     *dst_line, *dst;
  1.4148 +    int dst_stride;
  1.4149 +    uint32_t d;
  1.4150 +    uint32_t src;
  1.4151 +    int32_t w;
  1.4152 +
  1.4153 +    __m128i xmm_alpha;
  1.4154 +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1.4155 +
  1.4156 +    PIXMAN_IMAGE_GET_LINE (
  1.4157 +	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  1.4158 +
  1.4159 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.4160 +
  1.4161 +    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
  1.4162 +
  1.4163 +    src = src >> 24;
  1.4164 +
  1.4165 +    if (src == 0xff)
  1.4166 +	return;
  1.4167 +
  1.4168 +    if (src == 0x00)
  1.4169 +    {
  1.4170 +	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
  1.4171 +		     8, dest_x, dest_y, width, height, src);
  1.4172 +
  1.4173 +	return;
  1.4174 +    }
  1.4175 +
  1.4176 +    while (height--)
  1.4177 +    {
  1.4178 +	dst = dst_line;
  1.4179 +	dst_line += dst_stride;
  1.4180 +	w = width;
  1.4181 +
  1.4182 +	while (w && ((uintptr_t)dst & 15))
  1.4183 +	{
  1.4184 +	    d = (uint32_t) *dst;
  1.4185 +
  1.4186 +	    *dst++ = (uint8_t) pack_1x128_32 (
  1.4187 +		pix_multiply_1x128 (
  1.4188 +		    xmm_alpha,
  1.4189 +		    unpack_32_1x128 (d)));
  1.4190 +	    w--;
  1.4191 +	}
  1.4192 +
  1.4193 +	while (w >= 16)
  1.4194 +	{
  1.4195 +	    xmm_dst = load_128_aligned ((__m128i*)dst);
  1.4196 +
  1.4197 +	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
   1.4198 +
  1.4199 +	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
  1.4200 +				&xmm_dst_lo, &xmm_dst_hi,
  1.4201 +				&xmm_dst_lo, &xmm_dst_hi);
  1.4202 +
  1.4203 +	    save_128_aligned (
  1.4204 +		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.4205 +
  1.4206 +	    dst += 16;
  1.4207 +	    w -= 16;
  1.4208 +	}
  1.4209 +
  1.4210 +	while (w)
  1.4211 +	{
  1.4212 +	    d = (uint32_t) *dst;
  1.4213 +
  1.4214 +	    *dst++ = (uint8_t) pack_1x128_32 (
  1.4215 +		pix_multiply_1x128 (
  1.4216 +		    xmm_alpha,
  1.4217 +		    unpack_32_1x128 (d)));
  1.4218 +	    w--;
  1.4219 +	}
  1.4220 +    }
  1.4221 +
  1.4222 +}
  1.4223 +
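          +/* IN: a8 source multiplied into an a8 destination, sixteen pixels at
          + * a time.
          + */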
  1.4224 +static void
  1.4225 +sse2_composite_in_8_8 (pixman_implementation_t *imp,
  1.4226 +                       pixman_composite_info_t *info)
  1.4227 +{
  1.4228 +    PIXMAN_COMPOSITE_ARGS (info);
  1.4229 +    uint8_t     *dst_line, *dst;
  1.4230 +    uint8_t     *src_line, *src;
  1.4231 +    int src_stride, dst_stride;
  1.4232 +    int32_t w;
  1.4233 +    uint32_t s, d;
  1.4234 +
  1.4235 +    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  1.4236 +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1.4237 +
  1.4238 +    PIXMAN_IMAGE_GET_LINE (
  1.4239 +	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  1.4240 +    PIXMAN_IMAGE_GET_LINE (
  1.4241 +	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
  1.4242 +
  1.4243 +    while (height--)
  1.4244 +    {
  1.4245 +	dst = dst_line;
  1.4246 +	dst_line += dst_stride;
  1.4247 +	src = src_line;
  1.4248 +	src_line += src_stride;
  1.4249 +	w = width;
  1.4250 +
  1.4251 +	while (w && ((uintptr_t)dst & 15))
  1.4252 +	{
  1.4253 +	    s = (uint32_t) *src++;
  1.4254 +	    d = (uint32_t) *dst;
  1.4255 +
  1.4256 +	    *dst++ = (uint8_t) pack_1x128_32 (
  1.4257 +		pix_multiply_1x128 (
  1.4258 +		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
  1.4259 +	    w--;
  1.4260 +	}
  1.4261 +
  1.4262 +	while (w >= 16)
  1.4263 +	{
  1.4264 +	    xmm_src = load_128_unaligned ((__m128i*)src);
  1.4265 +	    xmm_dst = load_128_aligned ((__m128i*)dst);
  1.4266 +
  1.4267 +	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  1.4268 +	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1.4269 +
  1.4270 +	    pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.4271 +				&xmm_dst_lo, &xmm_dst_hi,
  1.4272 +				&xmm_dst_lo, &xmm_dst_hi);
  1.4273 +
  1.4274 +	    save_128_aligned (
  1.4275 +		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.4276 +
  1.4277 +	    src += 16;
  1.4278 +	    dst += 16;
  1.4279 +	    w -= 16;
  1.4280 +	}
  1.4281 +
  1.4282 +	while (w)
  1.4283 +	{
  1.4284 +	    s = (uint32_t) *src++;
  1.4285 +	    d = (uint32_t) *dst;
  1.4286 +
  1.4287 +	    *dst++ = (uint8_t) pack_1x128_32 (
  1.4288 +		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
  1.4289 +	    w--;
  1.4290 +	}
  1.4291 +    }
  1.4292 +
  1.4293 +}
  1.4294 +
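          +/* ADD: solid source alpha times an a8 mask, added to the a8
          + * destination with unsigned saturation.
          + */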
  1.4295 +static void
  1.4296 +sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
  1.4297 +			  pixman_composite_info_t *info)
  1.4298 +{
  1.4299 +    PIXMAN_COMPOSITE_ARGS (info);
  1.4300 +    uint8_t     *dst_line, *dst;
  1.4301 +    uint8_t     *mask_line, *mask;
  1.4302 +    int dst_stride, mask_stride;
  1.4303 +    int32_t w;
  1.4304 +    uint32_t src;
  1.4305 +    uint32_t m, d;
  1.4306 +
  1.4307 +    __m128i xmm_alpha;
  1.4308 +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  1.4309 +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1.4310 +
  1.4311 +    PIXMAN_IMAGE_GET_LINE (
  1.4312 +	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  1.4313 +    PIXMAN_IMAGE_GET_LINE (
  1.4314 +	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  1.4315 +
  1.4316 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.4317 +
  1.4318 +    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
  1.4319 +
  1.4320 +    while (height--)
  1.4321 +    {
  1.4322 +	dst = dst_line;
  1.4323 +	dst_line += dst_stride;
  1.4324 +	mask = mask_line;
  1.4325 +	mask_line += mask_stride;
  1.4326 +	w = width;
  1.4327 +
  1.4328 +	while (w && ((uintptr_t)dst & 15))
  1.4329 +	{
  1.4330 +	    m = (uint32_t) *mask++;
  1.4331 +	    d = (uint32_t) *dst;
  1.4332 +
  1.4333 +	    *dst++ = (uint8_t) pack_1x128_32 (
  1.4334 +		_mm_adds_epu16 (
  1.4335 +		    pix_multiply_1x128 (
  1.4336 +			xmm_alpha, unpack_32_1x128 (m)),
  1.4337 +		    unpack_32_1x128 (d)));
  1.4338 +	    w--;
  1.4339 +	}
  1.4340 +
  1.4341 +	while (w >= 16)
  1.4342 +	{
  1.4343 +	    xmm_mask = load_128_unaligned ((__m128i*)mask);
  1.4344 +	    xmm_dst = load_128_aligned ((__m128i*)dst);
  1.4345 +
  1.4346 +	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  1.4347 +	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1.4348 +
  1.4349 +	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
  1.4350 +				&xmm_mask_lo, &xmm_mask_hi,
  1.4351 +				&xmm_mask_lo, &xmm_mask_hi);
  1.4352 +
  1.4353 +	    xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
  1.4354 +	    xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
  1.4355 +
  1.4356 +	    save_128_aligned (
  1.4357 +		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.4358 +
  1.4359 +	    mask += 16;
  1.4360 +	    dst += 16;
  1.4361 +	    w -= 16;
  1.4362 +	}
  1.4363 +
  1.4364 +	while (w)
  1.4365 +	{
  1.4366 +	    m = (uint32_t) *mask++;
  1.4367 +	    d = (uint32_t) *dst;
  1.4368 +
  1.4369 +	    *dst++ = (uint8_t) pack_1x128_32 (
  1.4370 +		_mm_adds_epu16 (
  1.4371 +		    pix_multiply_1x128 (
  1.4372 +			xmm_alpha, unpack_32_1x128 (m)),
  1.4373 +		    unpack_32_1x128 (d)));
  1.4374 +
  1.4375 +	    w--;
  1.4376 +	}
  1.4377 +    }
  1.4378 +
  1.4379 +}
  1.4380 +
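          +/* ADD: solid source alpha added to an a8 destination with saturation.
          + * A zero source is a no-op; an opaque one becomes a 0xff fill.
          + */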
  1.4381 +static void
  1.4382 +sse2_composite_add_n_8 (pixman_implementation_t *imp,
  1.4383 +			pixman_composite_info_t *info)
  1.4384 +{
  1.4385 +    PIXMAN_COMPOSITE_ARGS (info);
  1.4386 +    uint8_t     *dst_line, *dst;
  1.4387 +    int dst_stride;
  1.4388 +    int32_t w;
  1.4389 +    uint32_t src;
  1.4390 +
  1.4391 +    __m128i xmm_src;
  1.4392 +
  1.4393 +    PIXMAN_IMAGE_GET_LINE (
  1.4394 +	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  1.4395 +
  1.4396 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.4397 +
  1.4398 +    src >>= 24;
  1.4399 +
  1.4400 +    if (src == 0x00)
  1.4401 +	return;
  1.4402 +
  1.4403 +    if (src == 0xff)
  1.4404 +    {
  1.4405 +	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
  1.4406 +		     8, dest_x, dest_y, width, height, 0xff);
  1.4407 +
  1.4408 +	return;
  1.4409 +    }
  1.4410 +
  1.4411 +    src = (src << 24) | (src << 16) | (src << 8) | src;
  1.4412 +    xmm_src = _mm_set_epi32 (src, src, src, src);
  1.4413 +
  1.4414 +    while (height--)
  1.4415 +    {
  1.4416 +	dst = dst_line;
  1.4417 +	dst_line += dst_stride;
  1.4418 +	w = width;
  1.4419 +
  1.4420 +	while (w && ((uintptr_t)dst & 15))
  1.4421 +	{
  1.4422 +	    *dst = (uint8_t)_mm_cvtsi128_si32 (
  1.4423 +		_mm_adds_epu8 (
  1.4424 +		    xmm_src,
  1.4425 +		    _mm_cvtsi32_si128 (*dst)));
  1.4426 +
  1.4427 +	    w--;
  1.4428 +	    dst++;
  1.4429 +	}
  1.4430 +
  1.4431 +	while (w >= 16)
  1.4432 +	{
  1.4433 +	    save_128_aligned (
  1.4434 +		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
  1.4435 +
  1.4436 +	    dst += 16;
  1.4437 +	    w -= 16;
  1.4438 +	}
  1.4439 +
  1.4440 +	while (w)
  1.4441 +	{
  1.4442 +	    *dst = (uint8_t)_mm_cvtsi128_si32 (
  1.4443 +		_mm_adds_epu8 (
  1.4444 +		    xmm_src,
  1.4445 +		    _mm_cvtsi32_si128 (*dst)));
  1.4446 +
  1.4447 +	    w--;
  1.4448 +	    dst++;
  1.4449 +	}
  1.4450 +    }
  1.4451 +
  1.4452 +}
  1.4453 +
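          +/* ADD: a8 source added to an a8 destination with saturation.  Scalar
          + * head and tail bring the pointers to 32-bit boundaries so the bulk
          + * can be handed to sse2_combine_add_u.
          + */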
  1.4454 +static void
  1.4455 +sse2_composite_add_8_8 (pixman_implementation_t *imp,
  1.4456 +			pixman_composite_info_t *info)
  1.4457 +{
  1.4458 +    PIXMAN_COMPOSITE_ARGS (info);
  1.4459 +    uint8_t     *dst_line, *dst;
  1.4460 +    uint8_t     *src_line, *src;
  1.4461 +    int dst_stride, src_stride;
  1.4462 +    int32_t w;
  1.4463 +    uint16_t t;
  1.4464 +
  1.4465 +    PIXMAN_IMAGE_GET_LINE (
  1.4466 +	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
  1.4467 +    PIXMAN_IMAGE_GET_LINE (
  1.4468 +	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  1.4469 +
  1.4470 +    while (height--)
  1.4471 +    {
  1.4472 +	dst = dst_line;
  1.4473 +	src = src_line;
  1.4474 +
  1.4475 +	dst_line += dst_stride;
  1.4476 +	src_line += src_stride;
  1.4477 +	w = width;
  1.4478 +
  1.4479 +	/* Small head */
  1.4480 +	while (w && (uintptr_t)dst & 3)
  1.4481 +	{
  1.4482 +	    t = (*dst) + (*src++);
  1.4483 +	    *dst++ = t | (0 - (t >> 8));
  1.4484 +	    w--;
  1.4485 +	}
  1.4486 +
  1.4487 +	sse2_combine_add_u (imp, op,
  1.4488 +			    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
  1.4489 +
  1.4490 +	/* Small tail */
  1.4491 +	dst += w & 0xfffc;
  1.4492 +	src += w & 0xfffc;
  1.4493 +
  1.4494 +	w &= 3;
  1.4495 +
  1.4496 +	while (w)
  1.4497 +	{
  1.4498 +	    t = (*dst) + (*src++);
  1.4499 +	    *dst++ = t | (0 - (t >> 8));
  1.4500 +	    w--;
  1.4501 +	}
  1.4502 +    }
  1.4503 +
  1.4504 +}
  1.4505 +
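          +/* ADD: 32 bpp source added to a 32 bpp destination, one call to
          + * sse2_combine_add_u per scanline.
          + */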
  1.4506 +static void
  1.4507 +sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
  1.4508 +                              pixman_composite_info_t *info)
  1.4509 +{
  1.4510 +    PIXMAN_COMPOSITE_ARGS (info);
  1.4511 +    uint32_t    *dst_line, *dst;
  1.4512 +    uint32_t    *src_line, *src;
  1.4513 +    int dst_stride, src_stride;
  1.4514 +
  1.4515 +    PIXMAN_IMAGE_GET_LINE (
  1.4516 +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.4517 +    PIXMAN_IMAGE_GET_LINE (
  1.4518 +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.4519 +
  1.4520 +    while (height--)
  1.4521 +    {
  1.4522 +	dst = dst_line;
  1.4523 +	dst_line += dst_stride;
  1.4524 +	src = src_line;
  1.4525 +	src_line += src_stride;
  1.4526 +
  1.4527 +	sse2_combine_add_u (imp, op, dst, src, NULL, width);
  1.4528 +    }
  1.4529 +}
  1.4530 +
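          +/* ADD: solid source added to a 32 bpp destination with saturation.
          + * A zero source is a no-op; an all-ones source becomes a plain fill.
          + */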
  1.4531 +static void
  1.4532 +sse2_composite_add_n_8888 (pixman_implementation_t *imp,
  1.4533 +			   pixman_composite_info_t *info)
  1.4534 +{
  1.4535 +    PIXMAN_COMPOSITE_ARGS (info);
  1.4536 +    uint32_t *dst_line, *dst, src;
  1.4537 +    int dst_stride;
  1.4538 +
  1.4539 +    __m128i xmm_src;
  1.4540 +
  1.4541 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.4542 +
  1.4543 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.4544 +    if (src == 0)
  1.4545 +	return;
  1.4546 +
  1.4547 +    if (src == ~0)
  1.4548 +    {
  1.4549 +	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
  1.4550 +		     dest_x, dest_y, width, height, ~0);
  1.4551 +
  1.4552 +	return;
  1.4553 +    }
  1.4554 +
  1.4555 +    xmm_src = _mm_set_epi32 (src, src, src, src);
  1.4556 +    while (height--)
  1.4557 +    {
  1.4558 +	int w = width;
  1.4559 +	uint32_t d;
  1.4560 +
  1.4561 +	dst = dst_line;
  1.4562 +	dst_line += dst_stride;
  1.4563 +
   1.4564 +	while (w && (uintptr_t)dst & 15)
  1.4565 +	{
  1.4566 +	    d = *dst;
  1.4567 +	    *dst++ =
  1.4568 +		_mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
  1.4569 +	    w--;
  1.4570 +	}
  1.4571 +
  1.4572 +	while (w >= 4)
  1.4573 +	{
  1.4574 +	    save_128_aligned
  1.4575 +		((__m128i*)dst,
  1.4576 +		 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
  1.4577 +
  1.4578 +	    dst += 4;
  1.4579 +	    w -= 4;
  1.4580 +	}
  1.4581 +
  1.4582 +	while (w--)
  1.4583 +	{
  1.4584 +	    d = *dst;
  1.4585 +	    *dst++ =
  1.4586 +		_mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
  1.4587 +						  _mm_cvtsi32_si128 (d)));
  1.4588 +	}
  1.4589 +    }
  1.4590 +}
  1.4591 +
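          +/* ADD: solid source times an a8 mask, added to a 32 bpp destination
          + * with saturation.
          + */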
  1.4592 +static void
  1.4593 +sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
  1.4594 +			     pixman_composite_info_t *info)
  1.4595 +{
  1.4596 +    PIXMAN_COMPOSITE_ARGS (info);
  1.4597 +    uint32_t     *dst_line, *dst;
  1.4598 +    uint8_t     *mask_line, *mask;
  1.4599 +    int dst_stride, mask_stride;
  1.4600 +    int32_t w;
  1.4601 +    uint32_t src;
  1.4602 +
  1.4603 +    __m128i xmm_src;
  1.4604 +
  1.4605 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.4606 +    if (src == 0)
  1.4607 +	return;
  1.4608 +    xmm_src = expand_pixel_32_1x128 (src);
  1.4609 +
  1.4610 +    PIXMAN_IMAGE_GET_LINE (
  1.4611 +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.4612 +    PIXMAN_IMAGE_GET_LINE (
  1.4613 +	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  1.4614 +
  1.4615 +    while (height--)
  1.4616 +    {
  1.4617 +	dst = dst_line;
  1.4618 +	dst_line += dst_stride;
  1.4619 +	mask = mask_line;
  1.4620 +	mask_line += mask_stride;
  1.4621 +	w = width;
  1.4622 +
   1.4623 +	while (w && ((uintptr_t)dst & 15))
  1.4624 +	{
  1.4625 +	    uint8_t m = *mask++;
  1.4626 +	    if (m)
  1.4627 +	    {
  1.4628 +		*dst = pack_1x128_32
  1.4629 +		    (_mm_adds_epu16
  1.4630 +		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
  1.4631 +		      unpack_32_1x128 (*dst)));
  1.4632 +	    }
  1.4633 +	    dst++;
  1.4634 +	    w--;
  1.4635 +	}
  1.4636 +
  1.4637 +	while (w >= 4)
  1.4638 +	{
  1.4639 +	    uint32_t m = *(uint32_t*)mask;
  1.4640 +	    if (m)
  1.4641 +	    {
  1.4642 +		__m128i xmm_mask_lo, xmm_mask_hi;
  1.4643 +		__m128i xmm_dst_lo, xmm_dst_hi;
  1.4644 +
  1.4645 +		__m128i xmm_dst = load_128_aligned ((__m128i*)dst);
  1.4646 +		__m128i xmm_mask =
  1.4647 +		    _mm_unpacklo_epi8 (unpack_32_1x128(m),
  1.4648 +				       _mm_setzero_si128 ());
  1.4649 +
  1.4650 +		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  1.4651 +		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1.4652 +
  1.4653 +		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
  1.4654 +					&xmm_mask_lo, &xmm_mask_hi);
  1.4655 +
  1.4656 +		pix_multiply_2x128 (&xmm_src, &xmm_src,
  1.4657 +				    &xmm_mask_lo, &xmm_mask_hi,
  1.4658 +				    &xmm_mask_lo, &xmm_mask_hi);
  1.4659 +
  1.4660 +		xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
  1.4661 +		xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
  1.4662 +
  1.4663 +		save_128_aligned (
  1.4664 +		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.4665 +	    }
  1.4666 +
  1.4667 +	    w -= 4;
  1.4668 +	    dst += 4;
  1.4669 +	    mask += 4;
  1.4670 +	}
  1.4671 +
  1.4672 +	while (w)
  1.4673 +	{
  1.4674 +	    uint8_t m = *mask++;
  1.4675 +	    if (m)
  1.4676 +	    {
  1.4677 +		*dst = pack_1x128_32
  1.4678 +		    (_mm_adds_epu16
  1.4679 +		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
  1.4680 +		      unpack_32_1x128 (*dst)));
  1.4681 +	    }
  1.4682 +	    dst++;
  1.4683 +	    w--;
  1.4684 +	}
  1.4685 +    }
  1.4686 +}
  1.4687 +
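          +/* Rectangle copy between two buffers of identical 16 or 32 bpp.  The
          + * destination is aligned to 16 bytes, then 64 bytes are moved per
          + * iteration using unaligned loads and aligned stores.
          + */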
  1.4688 +static pixman_bool_t
  1.4689 +sse2_blt (pixman_implementation_t *imp,
  1.4690 +          uint32_t *               src_bits,
  1.4691 +          uint32_t *               dst_bits,
  1.4692 +          int                      src_stride,
  1.4693 +          int                      dst_stride,
  1.4694 +          int                      src_bpp,
  1.4695 +          int                      dst_bpp,
  1.4696 +          int                      src_x,
  1.4697 +          int                      src_y,
  1.4698 +          int                      dest_x,
  1.4699 +          int                      dest_y,
  1.4700 +          int                      width,
  1.4701 +          int                      height)
  1.4702 +{
  1.4703 +    uint8_t *   src_bytes;
  1.4704 +    uint8_t *   dst_bytes;
  1.4705 +    int byte_width;
  1.4706 +
  1.4707 +    if (src_bpp != dst_bpp)
  1.4708 +	return FALSE;
  1.4709 +
  1.4710 +    if (src_bpp == 16)
  1.4711 +    {
  1.4712 +	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
  1.4713 +	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
  1.4714 +	src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
  1.4715 +	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
  1.4716 +	byte_width = 2 * width;
  1.4717 +	src_stride *= 2;
  1.4718 +	dst_stride *= 2;
  1.4719 +    }
  1.4720 +    else if (src_bpp == 32)
  1.4721 +    {
  1.4722 +	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
  1.4723 +	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
  1.4724 +	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
  1.4725 +	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
  1.4726 +	byte_width = 4 * width;
  1.4727 +	src_stride *= 4;
  1.4728 +	dst_stride *= 4;
  1.4729 +    }
  1.4730 +    else
  1.4731 +    {
  1.4732 +	return FALSE;
  1.4733 +    }
  1.4734 +
  1.4735 +    while (height--)
  1.4736 +    {
  1.4737 +	int w;
  1.4738 +	uint8_t *s = src_bytes;
  1.4739 +	uint8_t *d = dst_bytes;
  1.4740 +	src_bytes += src_stride;
  1.4741 +	dst_bytes += dst_stride;
  1.4742 +	w = byte_width;
  1.4743 +
  1.4744 +	while (w >= 2 && ((uintptr_t)d & 3))
  1.4745 +	{
  1.4746 +	    *(uint16_t *)d = *(uint16_t *)s;
  1.4747 +	    w -= 2;
  1.4748 +	    s += 2;
  1.4749 +	    d += 2;
  1.4750 +	}
  1.4751 +
  1.4752 +	while (w >= 4 && ((uintptr_t)d & 15))
  1.4753 +	{
  1.4754 +	    *(uint32_t *)d = *(uint32_t *)s;
  1.4755 +
  1.4756 +	    w -= 4;
  1.4757 +	    s += 4;
  1.4758 +	    d += 4;
  1.4759 +	}
  1.4760 +
  1.4761 +	while (w >= 64)
  1.4762 +	{
  1.4763 +	    __m128i xmm0, xmm1, xmm2, xmm3;
  1.4764 +
  1.4765 +	    xmm0 = load_128_unaligned ((__m128i*)(s));
  1.4766 +	    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
  1.4767 +	    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
  1.4768 +	    xmm3 = load_128_unaligned ((__m128i*)(s + 48));
  1.4769 +
  1.4770 +	    save_128_aligned ((__m128i*)(d),    xmm0);
  1.4771 +	    save_128_aligned ((__m128i*)(d + 16), xmm1);
  1.4772 +	    save_128_aligned ((__m128i*)(d + 32), xmm2);
  1.4773 +	    save_128_aligned ((__m128i*)(d + 48), xmm3);
  1.4774 +
  1.4775 +	    s += 64;
  1.4776 +	    d += 64;
  1.4777 +	    w -= 64;
  1.4778 +	}
  1.4779 +
  1.4780 +	while (w >= 16)
  1.4781 +	{
  1.4782 +	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
  1.4783 +
  1.4784 +	    w -= 16;
  1.4785 +	    d += 16;
  1.4786 +	    s += 16;
  1.4787 +	}
  1.4788 +
  1.4789 +	while (w >= 4)
  1.4790 +	{
  1.4791 +	    *(uint32_t *)d = *(uint32_t *)s;
  1.4792 +
  1.4793 +	    w -= 4;
  1.4794 +	    s += 4;
  1.4795 +	    d += 4;
  1.4796 +	}
  1.4797 +
  1.4798 +	if (w >= 2)
  1.4799 +	{
  1.4800 +	    *(uint16_t *)d = *(uint16_t *)s;
  1.4801 +	    w -= 2;
  1.4802 +	    s += 2;
  1.4803 +	    d += 2;
  1.4804 +	}
  1.4805 +    }
  1.4806 +
  1.4807 +    return TRUE;
  1.4808 +}
  1.4809 +
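          +/* SRC copy of a rectangle, implemented as a thin wrapper around
          + * sse2_blt.
          + */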
  1.4810 +static void
  1.4811 +sse2_composite_copy_area (pixman_implementation_t *imp,
  1.4812 +                          pixman_composite_info_t *info)
  1.4813 +{
  1.4814 +    PIXMAN_COMPOSITE_ARGS (info);
  1.4815 +    sse2_blt (imp, src_image->bits.bits,
  1.4816 +	      dest_image->bits.bits,
  1.4817 +	      src_image->bits.rowstride,
  1.4818 +	      dest_image->bits.rowstride,
  1.4819 +	      PIXMAN_FORMAT_BPP (src_image->bits.format),
  1.4820 +	      PIXMAN_FORMAT_BPP (dest_image->bits.format),
  1.4821 +	      src_x, src_y, dest_x, dest_y, width, height);
  1.4822 +}
  1.4823 +
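          +/* OVER: x8r8g8b8 source forced to full alpha, a8 mask, 32 bpp
          + * destination.  Groups of four pixels with an all-0xff mask are
          + * copied straight through.
          + */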
  1.4824 +static void
  1.4825 +sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
  1.4826 +                                 pixman_composite_info_t *info)
  1.4827 +{
  1.4828 +    PIXMAN_COMPOSITE_ARGS (info);
  1.4829 +    uint32_t    *src, *src_line, s;
  1.4830 +    uint32_t    *dst, *dst_line, d;
  1.4831 +    uint8_t         *mask, *mask_line;
  1.4832 +    uint32_t m;
  1.4833 +    int src_stride, mask_stride, dst_stride;
  1.4834 +    int32_t w;
  1.4835 +    __m128i ms;
  1.4836 +
  1.4837 +    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  1.4838 +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1.4839 +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  1.4840 +
  1.4841 +    PIXMAN_IMAGE_GET_LINE (
  1.4842 +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.4843 +    PIXMAN_IMAGE_GET_LINE (
  1.4844 +	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  1.4845 +    PIXMAN_IMAGE_GET_LINE (
  1.4846 +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.4847 +
  1.4848 +    while (height--)
  1.4849 +    {
  1.4850 +        src = src_line;
  1.4851 +        src_line += src_stride;
  1.4852 +        dst = dst_line;
  1.4853 +        dst_line += dst_stride;
  1.4854 +        mask = mask_line;
  1.4855 +        mask_line += mask_stride;
  1.4856 +
  1.4857 +        w = width;
  1.4858 +
  1.4859 +        while (w && (uintptr_t)dst & 15)
  1.4860 +        {
  1.4861 +            s = 0xff000000 | *src++;
  1.4862 +            m = (uint32_t) *mask++;
  1.4863 +            d = *dst;
  1.4864 +            ms = unpack_32_1x128 (s);
  1.4865 +
  1.4866 +            if (m != 0xff)
  1.4867 +            {
  1.4868 +		__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
  1.4869 +		__m128i md = unpack_32_1x128 (d);
  1.4870 +
  1.4871 +                ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
  1.4872 +            }
  1.4873 +
  1.4874 +            *dst++ = pack_1x128_32 (ms);
  1.4875 +            w--;
  1.4876 +        }
  1.4877 +
  1.4878 +        while (w >= 4)
  1.4879 +        {
  1.4880 +            m = *(uint32_t*) mask;
  1.4881 +            xmm_src = _mm_or_si128 (
  1.4882 +		load_128_unaligned ((__m128i*)src), mask_ff000000);
  1.4883 +
  1.4884 +            if (m == 0xffffffff)
  1.4885 +            {
  1.4886 +                save_128_aligned ((__m128i*)dst, xmm_src);
  1.4887 +            }
  1.4888 +            else
  1.4889 +            {
  1.4890 +                xmm_dst = load_128_aligned ((__m128i*)dst);
  1.4891 +
  1.4892 +                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
  1.4893 +
  1.4894 +                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  1.4895 +                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  1.4896 +                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1.4897 +
  1.4898 +                expand_alpha_rev_2x128 (
  1.4899 +		    xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1.4900 +
  1.4901 +                in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.4902 +			       &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
  1.4903 +			       &xmm_dst_lo, &xmm_dst_hi);
  1.4904 +
  1.4905 +                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.4906 +            }
  1.4907 +
  1.4908 +            src += 4;
  1.4909 +            dst += 4;
  1.4910 +            mask += 4;
  1.4911 +            w -= 4;
  1.4912 +        }
  1.4913 +
  1.4914 +        while (w)
  1.4915 +        {
  1.4916 +            m = (uint32_t) *mask++;
  1.4917 +
  1.4918 +            if (m)
  1.4919 +            {
  1.4920 +                s = 0xff000000 | *src;
  1.4921 +
  1.4922 +                if (m == 0xff)
  1.4923 +                {
  1.4924 +                    *dst = s;
  1.4925 +                }
  1.4926 +                else
  1.4927 +                {
  1.4928 +		    __m128i ma, md, ms;
  1.4929 +
  1.4930 +                    d = *dst;
  1.4931 +
  1.4932 +		    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
  1.4933 +		    md = unpack_32_1x128 (d);
  1.4934 +		    ms = unpack_32_1x128 (s);
  1.4935 +
  1.4936 +                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
  1.4937 +                }
  1.4938 +
  1.4939 +            }
  1.4940 +
  1.4941 +            src++;
  1.4942 +            dst++;
  1.4943 +            w--;
  1.4944 +        }
  1.4945 +    }
  1.4946 +
  1.4947 +}
  1.4948 +
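          +/* OVER: a8r8g8b8 source, a8 mask, 32 bpp destination.  Groups of four
          + * pixels are skipped when the mask is zero and copied directly when
          + * both mask and source are fully opaque.
          + */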
  1.4949 +static void
  1.4950 +sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
  1.4951 +                                 pixman_composite_info_t *info)
  1.4952 +{
  1.4953 +    PIXMAN_COMPOSITE_ARGS (info);
  1.4954 +    uint32_t    *src, *src_line, s;
  1.4955 +    uint32_t    *dst, *dst_line, d;
  1.4956 +    uint8_t         *mask, *mask_line;
  1.4957 +    uint32_t m;
  1.4958 +    int src_stride, mask_stride, dst_stride;
  1.4959 +    int32_t w;
  1.4960 +
  1.4961 +    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
  1.4962 +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1.4963 +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  1.4964 +
  1.4965 +    PIXMAN_IMAGE_GET_LINE (
  1.4966 +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.4967 +    PIXMAN_IMAGE_GET_LINE (
  1.4968 +	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  1.4969 +    PIXMAN_IMAGE_GET_LINE (
  1.4970 +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.4971 +
  1.4972 +    while (height--)
  1.4973 +    {
  1.4974 +        src = src_line;
  1.4975 +        src_line += src_stride;
  1.4976 +        dst = dst_line;
  1.4977 +        dst_line += dst_stride;
  1.4978 +        mask = mask_line;
  1.4979 +        mask_line += mask_stride;
  1.4980 +
  1.4981 +        w = width;
  1.4982 +
  1.4983 +        while (w && (uintptr_t)dst & 15)
  1.4984 +        {
  1.4985 +	    uint32_t sa;
  1.4986 +
  1.4987 +            s = *src++;
  1.4988 +            m = (uint32_t) *mask++;
  1.4989 +            d = *dst;
  1.4990 +
  1.4991 +	    sa = s >> 24;
  1.4992 +
  1.4993 +	    if (m)
  1.4994 +	    {
  1.4995 +		if (sa == 0xff && m == 0xff)
  1.4996 +		{
  1.4997 +		    *dst = s;
  1.4998 +		}
  1.4999 +		else
  1.5000 +		{
  1.5001 +		    __m128i ms, md, ma, msa;
  1.5002 +
  1.5003 +		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
  1.5004 +		    ms = unpack_32_1x128 (s);
  1.5005 +		    md = unpack_32_1x128 (d);
  1.5006 +
  1.5007 +		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
  1.5008 +
  1.5009 +		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
  1.5010 +		}
  1.5011 +	    }
  1.5012 +
  1.5013 +	    dst++;
  1.5014 +            w--;
  1.5015 +        }
  1.5016 +
  1.5017 +        while (w >= 4)
  1.5018 +        {
  1.5019 +            m = *(uint32_t *) mask;
  1.5020 +
  1.5021 +	    if (m)
  1.5022 +	    {
  1.5023 +		xmm_src = load_128_unaligned ((__m128i*)src);
  1.5024 +
  1.5025 +		if (m == 0xffffffff && is_opaque (xmm_src))
  1.5026 +		{
  1.5027 +		    save_128_aligned ((__m128i *)dst, xmm_src);
  1.5028 +		}
  1.5029 +		else
  1.5030 +		{
  1.5031 +		    xmm_dst = load_128_aligned ((__m128i *)dst);
  1.5032 +
  1.5033 +		    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
  1.5034 +
  1.5035 +		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  1.5036 +		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  1.5037 +		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1.5038 +
  1.5039 +		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
  1.5040 +		    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1.5041 +
  1.5042 +		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
  1.5043 +				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.5044 +
  1.5045 +		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.5046 +		}
  1.5047 +	    }
  1.5048 +
  1.5049 +            src += 4;
  1.5050 +            dst += 4;
  1.5051 +            mask += 4;
  1.5052 +            w -= 4;
  1.5053 +        }
  1.5054 +
  1.5055 +        while (w)
  1.5056 +        {
  1.5057 +	    uint32_t sa;
  1.5058 +
  1.5059 +            s = *src++;
  1.5060 +            m = (uint32_t) *mask++;
  1.5061 +            d = *dst;
  1.5062 +
  1.5063 +	    sa = s >> 24;
  1.5064 +
  1.5065 +	    if (m)
  1.5066 +	    {
  1.5067 +		if (sa == 0xff && m == 0xff)
  1.5068 +		{
  1.5069 +		    *dst = s;
  1.5070 +		}
  1.5071 +		else
  1.5072 +		{
  1.5073 +		    __m128i ms, md, ma, msa;
  1.5074 +
  1.5075 +		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
  1.5076 +		    ms = unpack_32_1x128 (s);
  1.5077 +		    md = unpack_32_1x128 (d);
  1.5078 +
  1.5079 +		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
  1.5080 +
  1.5081 +		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
  1.5082 +		}
  1.5083 +	    }
  1.5084 +
  1.5085 +	    dst++;
  1.5086 +            w--;
  1.5087 +        }
  1.5088 +    }
  1.5089 +
  1.5090 +}
  1.5091 +
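          +/* OVER_REVERSE: the 32 bpp destination is composited over a solid
          + * source, i.e. the source only shows through where the destination
          + * is not opaque.
          + */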
  1.5092 +static void
  1.5093 +sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
  1.5094 +				    pixman_composite_info_t *info)
  1.5095 +{
  1.5096 +    PIXMAN_COMPOSITE_ARGS (info);
  1.5097 +    uint32_t src;
  1.5098 +    uint32_t    *dst_line, *dst;
  1.5099 +    __m128i xmm_src;
  1.5100 +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1.5101 +    __m128i xmm_dsta_hi, xmm_dsta_lo;
  1.5102 +    int dst_stride;
  1.5103 +    int32_t w;
  1.5104 +
  1.5105 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.5106 +
  1.5107 +    if (src == 0)
  1.5108 +	return;
  1.5109 +
  1.5110 +    PIXMAN_IMAGE_GET_LINE (
  1.5111 +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.5112 +
  1.5113 +    xmm_src = expand_pixel_32_1x128 (src);
  1.5114 +
  1.5115 +    while (height--)
  1.5116 +    {
  1.5117 +	dst = dst_line;
  1.5118 +
  1.5119 +	dst_line += dst_stride;
  1.5120 +	w = width;
  1.5121 +
  1.5122 +	while (w && (uintptr_t)dst & 15)
  1.5123 +	{
  1.5124 +	    __m128i vd;
  1.5125 +
  1.5126 +	    vd = unpack_32_1x128 (*dst);
  1.5127 +
  1.5128 +	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
  1.5129 +					      xmm_src));
  1.5130 +	    w--;
  1.5131 +	    dst++;
  1.5132 +	}
  1.5133 +
  1.5134 +	while (w >= 4)
  1.5135 +	{
  1.5136 +	    __m128i tmp_lo, tmp_hi;
  1.5137 +
  1.5138 +	    xmm_dst = load_128_aligned ((__m128i*)dst);
  1.5139 +
  1.5140 +	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1.5141 +	    expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
  1.5142 +
  1.5143 +	    tmp_lo = xmm_src;
  1.5144 +	    tmp_hi = xmm_src;
  1.5145 +
  1.5146 +	    over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
  1.5147 +			&xmm_dsta_lo, &xmm_dsta_hi,
  1.5148 +			&tmp_lo, &tmp_hi);
  1.5149 +
  1.5150 +	    save_128_aligned (
  1.5151 +		(__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
  1.5152 +
  1.5153 +	    w -= 4;
  1.5154 +	    dst += 4;
  1.5155 +	}
  1.5156 +
  1.5157 +	while (w)
  1.5158 +	{
  1.5159 +	    __m128i vd;
  1.5160 +
  1.5161 +	    vd = unpack_32_1x128 (*dst);
  1.5162 +
  1.5163 +	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
  1.5164 +					      xmm_src));
  1.5165 +	    w--;
  1.5166 +	    dst++;
  1.5167 +	}
  1.5168 +
  1.5169 +    }
  1.5170 +
  1.5171 +}
  1.5172 +
  1.5173 +static void
  1.5174 +sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
  1.5175 +				    pixman_composite_info_t *info)
  1.5176 +{
  1.5177 +    PIXMAN_COMPOSITE_ARGS (info);
  1.5178 +    uint32_t    *src, *src_line, s;
  1.5179 +    uint32_t    *dst, *dst_line, d;
  1.5180 +    uint32_t    *mask, *mask_line;
  1.5181 +    uint32_t    m;
  1.5182 +    int src_stride, mask_stride, dst_stride;
  1.5183 +    int32_t w;
  1.5184 +
  1.5185 +    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
  1.5186 +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1.5187 +    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  1.5188 +
  1.5189 +    PIXMAN_IMAGE_GET_LINE (
  1.5190 +	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.5191 +    PIXMAN_IMAGE_GET_LINE (
  1.5192 +	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
  1.5193 +    PIXMAN_IMAGE_GET_LINE (
  1.5194 +	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.5195 +
  1.5196 +    while (height--)
  1.5197 +    {
  1.5198 +        src = src_line;
  1.5199 +        src_line += src_stride;
  1.5200 +        dst = dst_line;
  1.5201 +        dst_line += dst_stride;
  1.5202 +        mask = mask_line;
  1.5203 +        mask_line += mask_stride;
  1.5204 +
  1.5205 +        w = width;
  1.5206 +
  1.5207 +        while (w && (uintptr_t)dst & 15)
  1.5208 +        {
  1.5209 +	    uint32_t sa;
  1.5210 +
  1.5211 +            s = *src++;
  1.5212 +            m = (*mask++) >> 24;
  1.5213 +            d = *dst;
  1.5214 +
  1.5215 +	    sa = s >> 24;
  1.5216 +
  1.5217 +	    if (m)
  1.5218 +	    {
  1.5219 +		if (sa == 0xff && m == 0xff)
  1.5220 +		{
  1.5221 +		    *dst = s;
  1.5222 +		}
  1.5223 +		else
  1.5224 +		{
  1.5225 +		    __m128i ms, md, ma, msa;
  1.5226 +
  1.5227 +		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
  1.5228 +		    ms = unpack_32_1x128 (s);
  1.5229 +		    md = unpack_32_1x128 (d);
  1.5230 +
  1.5231 +		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
  1.5232 +
  1.5233 +		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
  1.5234 +		}
  1.5235 +	    }
  1.5236 +
  1.5237 +	    dst++;
  1.5238 +            w--;
  1.5239 +        }
  1.5240 +
  1.5241 +        while (w >= 4)
  1.5242 +        {
  1.5243 +	    xmm_mask = load_128_unaligned ((__m128i*)mask);
  1.5244 +
  1.5245 +	    if (!is_transparent (xmm_mask))
  1.5246 +	    {
  1.5247 +		xmm_src = load_128_unaligned ((__m128i*)src);
  1.5248 +
  1.5249 +		if (is_opaque (xmm_mask) && is_opaque (xmm_src))
  1.5250 +		{
  1.5251 +		    save_128_aligned ((__m128i *)dst, xmm_src);
  1.5252 +		}
  1.5253 +		else
  1.5254 +		{
  1.5255 +		    xmm_dst = load_128_aligned ((__m128i *)dst);
  1.5256 +
  1.5257 +		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  1.5258 +		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  1.5259 +		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1.5260 +
  1.5261 +		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
  1.5262 +		    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1.5263 +
  1.5264 +		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
  1.5265 +				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.5266 +
  1.5267 +		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.5268 +		}
  1.5269 +	    }
  1.5270 +
  1.5271 +            src += 4;
  1.5272 +            dst += 4;
  1.5273 +            mask += 4;
  1.5274 +            w -= 4;
  1.5275 +        }
  1.5276 +
  1.5277 +        while (w)
  1.5278 +        {
  1.5279 +	    uint32_t sa;
  1.5280 +
  1.5281 +            s = *src++;
  1.5282 +            m = (*mask++) >> 24;
  1.5283 +            d = *dst;
  1.5284 +
  1.5285 +	    sa = s >> 24;
  1.5286 +
  1.5287 +	    if (m)
  1.5288 +	    {
  1.5289 +		if (sa == 0xff && m == 0xff)
  1.5290 +		{
  1.5291 +		    *dst = s;
  1.5292 +		}
  1.5293 +		else
  1.5294 +		{
  1.5295 +		    __m128i ms, md, ma, msa;
  1.5296 +
  1.5297 +		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
  1.5298 +		    ms = unpack_32_1x128 (s);
  1.5299 +		    md = unpack_32_1x128 (d);
  1.5300 +
  1.5301 +		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
  1.5302 +
  1.5303 +		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
  1.5304 +		}
  1.5305 +	    }
  1.5306 +
  1.5307 +	    dst++;
  1.5308 +            w--;
  1.5309 +        }
  1.5310 +    }
  1.5311 +
  1.5312 +}
  1.5313 +
  1.5314 +/* A variant of 'sse2_combine_over_u' with minor tweaks */
  1.5315 +static force_inline void
  1.5316 +scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
  1.5317 +                                             const uint32_t* ps,
  1.5318 +                                             int32_t         w,
  1.5319 +                                             pixman_fixed_t  vx,
  1.5320 +                                             pixman_fixed_t  unit_x,
  1.5321 +                                             pixman_fixed_t  src_width_fixed,
  1.5322 +                                             pixman_bool_t   fully_transparent_src)
  1.5323 +{
  1.5324 +    uint32_t s, d;
  1.5325 +    const uint32_t* pm = NULL;
  1.5326 +
  1.5327 +    __m128i xmm_dst_lo, xmm_dst_hi;
  1.5328 +    __m128i xmm_src_lo, xmm_src_hi;
  1.5329 +    __m128i xmm_alpha_lo, xmm_alpha_hi;
  1.5330 +
  1.5331 +    if (fully_transparent_src)
  1.5332 +	return;
  1.5333 +
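         +    /* Throughout this scanline function vx is advanced by unit_x and then
         +     * wrapped back below zero by subtracting src_width_fixed.  The
         +     * FAST_NEAREST_MAINLOOP macros that instantiate it are assumed to set up
         +     * ps and the initial vx so that pixman_fixed_to_int (vx) always produces
         +     * a valid offset into the source line. */
         +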
  1.5334 +    /* Align dst on a 16-byte boundary */
  1.5335 +    while (w && ((uintptr_t)pd & 15))
  1.5336 +    {
  1.5337 +	d = *pd;
  1.5338 +	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
  1.5339 +	vx += unit_x;
  1.5340 +	while (vx >= 0)
  1.5341 +	    vx -= src_width_fixed;
  1.5342 +
  1.5343 +	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
  1.5344 +	if (pm)
  1.5345 +	    pm++;
  1.5346 +	w--;
  1.5347 +    }
  1.5348 +
  1.5349 +    while (w >= 4)
  1.5350 +    {
  1.5351 +	__m128i tmp;
  1.5352 +	uint32_t tmp1, tmp2, tmp3, tmp4;
  1.5353 +
  1.5354 +	tmp1 = *(ps + pixman_fixed_to_int (vx));
  1.5355 +	vx += unit_x;
  1.5356 +	while (vx >= 0)
  1.5357 +	    vx -= src_width_fixed;
  1.5358 +	tmp2 = *(ps + pixman_fixed_to_int (vx));
  1.5359 +	vx += unit_x;
  1.5360 +	while (vx >= 0)
  1.5361 +	    vx -= src_width_fixed;
  1.5362 +	tmp3 = *(ps + pixman_fixed_to_int (vx));
  1.5363 +	vx += unit_x;
  1.5364 +	while (vx >= 0)
  1.5365 +	    vx -= src_width_fixed;
  1.5366 +	tmp4 = *(ps + pixman_fixed_to_int (vx));
  1.5367 +	vx += unit_x;
  1.5368 +	while (vx >= 0)
  1.5369 +	    vx -= src_width_fixed;
  1.5370 +
  1.5371 +	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
  1.5372 +
  1.5373 +	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
  1.5374 +
  1.5375 +	if (is_opaque (xmm_src_hi))
  1.5376 +	{
  1.5377 +	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
  1.5378 +	}
  1.5379 +	else if (!is_zero (xmm_src_hi))
  1.5380 +	{
  1.5381 +	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);
  1.5382 +
  1.5383 +	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
  1.5384 +	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.5385 +
  1.5386 +	    expand_alpha_2x128 (
  1.5387 +		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
  1.5388 +
  1.5389 +	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.5390 +			&xmm_alpha_lo, &xmm_alpha_hi,
  1.5391 +			&xmm_dst_lo, &xmm_dst_hi);
  1.5392 +
  1.5393 +	    /* rebuild the 4 pixel data and save */
  1.5394 +	    save_128_aligned ((__m128i*)pd,
  1.5395 +			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.5396 +	}
  1.5397 +
  1.5398 +	w -= 4;
  1.5399 +	pd += 4;
  1.5400 +	if (pm)
  1.5401 +	    pm += 4;
  1.5402 +    }
  1.5403 +
  1.5404 +    while (w)
  1.5405 +    {
  1.5406 +	d = *pd;
  1.5407 +	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
  1.5408 +	vx += unit_x;
  1.5409 +	while (vx >= 0)
  1.5410 +	    vx -= src_width_fixed;
  1.5411 +
  1.5412 +	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
  1.5413 +	if (pm)
  1.5414 +	    pm++;
  1.5415 +
  1.5416 +	w--;
  1.5417 +    }
  1.5418 +}
  1.5419 +
  1.5420 +FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
  1.5421 +		       scaled_nearest_scanline_sse2_8888_8888_OVER,
  1.5422 +		       uint32_t, uint32_t, COVER)
  1.5423 +FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
  1.5424 +		       scaled_nearest_scanline_sse2_8888_8888_OVER,
  1.5425 +		       uint32_t, uint32_t, NONE)
  1.5426 +FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
  1.5427 +		       scaled_nearest_scanline_sse2_8888_8888_OVER,
  1.5428 +		       uint32_t, uint32_t, PAD)
  1.5429 +FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
  1.5430 +		       scaled_nearest_scanline_sse2_8888_8888_OVER,
  1.5431 +		       uint32_t, uint32_t, NORMAL)
  1.5432 +
  1.5433 +static force_inline void
  1.5434 +scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
  1.5435 +					       uint32_t *       dst,
  1.5436 +					       const uint32_t * src,
  1.5437 +					       int32_t          w,
  1.5438 +					       pixman_fixed_t   vx,
  1.5439 +					       pixman_fixed_t   unit_x,
  1.5440 +					       pixman_fixed_t   src_width_fixed,
  1.5441 +					       pixman_bool_t    zero_src)
  1.5442 +{
  1.5443 +    __m128i xmm_mask;
  1.5444 +    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  1.5445 +    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1.5446 +    __m128i xmm_alpha_lo, xmm_alpha_hi;
  1.5447 +
  1.5448 +    if (zero_src || (*mask >> 24) == 0)
  1.5449 +	return;
  1.5450 +
  1.5451 +    xmm_mask = create_mask_16_128 (*mask >> 24);
  1.5452 +
  1.5453 +    while (w && (uintptr_t)dst & 15)
  1.5454 +    {
  1.5455 +	uint32_t s = *(src + pixman_fixed_to_int (vx));
  1.5456 +	vx += unit_x;
  1.5457 +	while (vx >= 0)
  1.5458 +	    vx -= src_width_fixed;
  1.5459 +
  1.5460 +	if (s)
  1.5461 +	{
  1.5462 +	    uint32_t d = *dst;
  1.5463 +
  1.5464 +	    __m128i ms = unpack_32_1x128 (s);
  1.5465 +	    __m128i alpha     = expand_alpha_1x128 (ms);
  1.5466 +	    __m128i dest      = xmm_mask;
  1.5467 +	    __m128i alpha_dst = unpack_32_1x128 (d);
  1.5468 +
  1.5469 +	    *dst = pack_1x128_32 (
  1.5470 +		in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
  1.5471 +	}
  1.5472 +	dst++;
  1.5473 +	w--;
  1.5474 +    }
  1.5475 +
  1.5476 +    while (w >= 4)
  1.5477 +    {
  1.5478 +	uint32_t tmp1, tmp2, tmp3, tmp4;
  1.5479 +
  1.5480 +	tmp1 = *(src + pixman_fixed_to_int (vx));
  1.5481 +	vx += unit_x;
  1.5482 +	while (vx >= 0)
  1.5483 +	    vx -= src_width_fixed;
  1.5484 +	tmp2 = *(src + pixman_fixed_to_int (vx));
  1.5485 +	vx += unit_x;
  1.5486 +	while (vx >= 0)
  1.5487 +	    vx -= src_width_fixed;
  1.5488 +	tmp3 = *(src + pixman_fixed_to_int (vx));
  1.5489 +	vx += unit_x;
  1.5490 +	while (vx >= 0)
  1.5491 +	    vx -= src_width_fixed;
  1.5492 +	tmp4 = *(src + pixman_fixed_to_int (vx));
  1.5493 +	vx += unit_x;
  1.5494 +	while (vx >= 0)
  1.5495 +	    vx -= src_width_fixed;
  1.5496 +
  1.5497 +	xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
  1.5498 +
  1.5499 +	if (!is_zero (xmm_src))
  1.5500 +	{
  1.5501 +	    xmm_dst = load_128_aligned ((__m128i*)dst);
  1.5502 +
  1.5503 +	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  1.5504 +	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1.5505 +	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1.5506 +			        &xmm_alpha_lo, &xmm_alpha_hi);
  1.5507 +
  1.5508 +	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.5509 +			   &xmm_alpha_lo, &xmm_alpha_hi,
  1.5510 +			   &xmm_mask, &xmm_mask,
  1.5511 +			   &xmm_dst_lo, &xmm_dst_hi);
  1.5512 +
  1.5513 +	    save_128_aligned (
  1.5514 +		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.5515 +	}
  1.5516 +
  1.5517 +	dst += 4;
  1.5518 +	w -= 4;
  1.5519 +    }
  1.5520 +
  1.5521 +    while (w)
  1.5522 +    {
  1.5523 +	uint32_t s = *(src + pixman_fixed_to_int (vx));
  1.5524 +	vx += unit_x;
  1.5525 +	while (vx >= 0)
  1.5526 +	    vx -= src_width_fixed;
  1.5527 +
  1.5528 +	if (s)
  1.5529 +	{
  1.5530 +	    uint32_t d = *dst;
  1.5531 +
  1.5532 +	    __m128i ms = unpack_32_1x128 (s);
  1.5533 +	    __m128i alpha = expand_alpha_1x128 (ms);
  1.5534 +	    __m128i mask  = xmm_mask;
  1.5535 +	    __m128i dest  = unpack_32_1x128 (d);
  1.5536 +
  1.5537 +	    *dst = pack_1x128_32 (
  1.5538 +		in_over_1x128 (&ms, &alpha, &mask, &dest));
  1.5539 +	}
  1.5540 +
  1.5541 +	dst++;
  1.5542 +	w--;
  1.5543 +    }
  1.5544 +
  1.5545 +}
  1.5546 +
  1.5547 +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
  1.5548 +			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
  1.5549 +			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
  1.5550 +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
  1.5551 +			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
  1.5552 +			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
  1.5553 +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
  1.5554 +			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
  1.5555 +			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
  1.5556 +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
  1.5557 +			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
  1.5558 +			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
  1.5559 +
  1.5560 +#define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
  1.5561 +
  1.5562 +#define BILINEAR_DECLARE_VARIABLES						\
  1.5563 +    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
  1.5564 +    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
  1.5565 +    const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\
  1.5566 +    const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
  1.5567 +    const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\
  1.5568 +    const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
  1.5569 +    const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,	\
  1.5570 +					  unit_x, unit_x, unit_x, unit_x);	\
  1.5571 +    const __m128i xmm_zero = _mm_setzero_si128 ();				\
  1.5572 +    __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
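         +
         +/* BILINEAR_DECLARE_VARIABLES sets up the vertical weights (wt/wb) and the
         + * horizontal coordinate/step (xmm_x/xmm_ux) replicated across all eight
         + * 16-bit lanes, plus the xor/add constants that BILINEAR_INTERPOLATE_ONE_PIXEL
         + * uses to turn the top bits of xmm_x into the left/right horizontal weights. */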
  1.5573 +
  1.5574 +#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
  1.5575 +do {										\
  1.5576 +    __m128i xmm_wh, xmm_lo, xmm_hi, a;						\
  1.5577 +    /* fetch 2x2 pixel block into sse2 registers */				\
  1.5578 +    __m128i tltr = _mm_loadl_epi64 (						\
  1.5579 +			    (__m128i *)&src_top[pixman_fixed_to_int (vx)]);	\
  1.5580 +    __m128i blbr = _mm_loadl_epi64 (						\
  1.5581 +			    (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]);	\
  1.5582 +    vx += unit_x;								\
  1.5583 +    /* vertical interpolation */						\
  1.5584 +    a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero),	\
  1.5585 +					xmm_wt),				\
  1.5586 +		       _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero),	\
  1.5587 +					xmm_wb));				\
  1.5588 +    if (BILINEAR_INTERPOLATION_BITS < 8)					\
  1.5589 +    {										\
  1.5590 +	/* calculate horizontal weights */					\
  1.5591 +	xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7,		\
  1.5592 +		   _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));	\
  1.5593 +	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
  1.5594 +	/* horizontal interpolation */						\
  1.5595 +	a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
  1.5596 +		a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh);			\
  1.5597 +    }										\
  1.5598 +    else									\
  1.5599 +    {										\
  1.5600 +	/* calculate horizontal weights */					\
  1.5601 +	xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8,		\
  1.5602 +		_mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS)));	\
  1.5603 +	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
  1.5604 +	/* horizontal interpolation */						\
  1.5605 +	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
  1.5606 +	xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\
  1.5607 +	a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\
  1.5608 +			   _mm_unpackhi_epi16 (xmm_lo, xmm_hi));		\
  1.5609 +    }										\
  1.5610 +    /* shift and pack the result */						\
  1.5611 +    a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2);			\
  1.5612 +    a = _mm_packs_epi32 (a, a);							\
  1.5613 +    a = _mm_packus_epi16 (a, a);						\
  1.5614 +    pix = _mm_cvtsi128_si32 (a);						\
  1.5615 +} while (0)
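         +
         +/* A sketch of the per-channel scalar computation that the macro above is
         + * meant to perform, writing W for (1 << BILINEAR_INTERPOLATION_BITS) and
         + * assuming wt + wb == W, with wx taken from the top bits of the fractional
         + * part of vx:
         + *
         + *     vert_l = tl * wt + bl * wb;
         + *     vert_r = tr * wt + br * wb;
         + *     pix    = (vert_l * (W - wx) + vert_r * wx) >> (2 * BILINEAR_INTERPOLATION_BITS);
         + *
         + * The vertical pass is done with 16-bit multiplies; the horizontal pass uses
         + * pmaddwd when BILINEAR_INTERPOLATION_BITS < 8 and a mullo/mulhi pair otherwise. */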
  1.5616 +
  1.5617 +#define BILINEAR_SKIP_ONE_PIXEL()						\
  1.5618 +do {										\
  1.5619 +    vx += unit_x;								\
  1.5620 +    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
  1.5621 +} while(0)
  1.5622 +
  1.5623 +static force_inline void
  1.5624 +scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
  1.5625 +					     const uint32_t * mask,
  1.5626 +					     const uint32_t * src_top,
  1.5627 +					     const uint32_t * src_bottom,
  1.5628 +					     int32_t          w,
  1.5629 +					     int              wt,
  1.5630 +					     int              wb,
  1.5631 +					     pixman_fixed_t   vx,
  1.5632 +					     pixman_fixed_t   unit_x,
  1.5633 +					     pixman_fixed_t   max_vx,
  1.5634 +					     pixman_bool_t    zero_src)
  1.5635 +{
  1.5636 +    BILINEAR_DECLARE_VARIABLES;
  1.5637 +    uint32_t pix1, pix2, pix3, pix4;
  1.5638 +
  1.5639 +    while ((w -= 4) >= 0)
  1.5640 +    {
  1.5641 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  1.5642 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
  1.5643 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
  1.5644 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
  1.5645 +	*dst++ = pix1;
  1.5646 +	*dst++ = pix2;
  1.5647 +	*dst++ = pix3;
  1.5648 +	*dst++ = pix4;
  1.5649 +    }
  1.5650 +
  1.5651 +    if (w & 2)
  1.5652 +    {
  1.5653 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  1.5654 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
  1.5655 +	*dst++ = pix1;
  1.5656 +	*dst++ = pix2;
  1.5657 +    }
  1.5658 +
  1.5659 +    if (w & 1)
  1.5660 +    {
  1.5661 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  1.5662 +	*dst = pix1;
  1.5663 +    }
  1.5664 +
  1.5665 +}
  1.5666 +
  1.5667 +/* Add an extra NULL argument to the existing bilinear fast paths to indicate
  1.5668 + * that we don't need two-pass processing */
  1.5669 +
  1.5670 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
  1.5671 +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
  1.5672 +			       uint32_t, uint32_t, uint32_t,
  1.5673 +			       COVER, FLAG_NONE)
  1.5674 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
  1.5675 +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
  1.5676 +			       uint32_t, uint32_t, uint32_t,
  1.5677 +			       PAD, FLAG_NONE)
  1.5678 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
  1.5679 +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
  1.5680 +			       uint32_t, uint32_t, uint32_t,
  1.5681 +			       NONE, FLAG_NONE)
  1.5682 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
  1.5683 +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
  1.5684 +			       uint32_t, uint32_t, uint32_t,
  1.5685 +			       NORMAL, FLAG_NONE)
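         +
         +/* When a second-stage operator is supplied instead of NULL (as for the
         + * bilinear over_8888_0565 paths further below), the mainloop is expected to
         + * let the SRC scanline above render into a temporary a8r8g8b8 buffer and then
         + * hand that buffer to the operator for combining with the real destination. */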
  1.5686 +
  1.5687 +static force_inline void
  1.5688 +scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
  1.5689 +					      const uint32_t * mask,
  1.5690 +					      const uint32_t * src_top,
  1.5691 +					      const uint32_t * src_bottom,
  1.5692 +					      int32_t          w,
  1.5693 +					      int              wt,
  1.5694 +					      int              wb,
  1.5695 +					      pixman_fixed_t   vx,
  1.5696 +					      pixman_fixed_t   unit_x,
  1.5697 +					      pixman_fixed_t   max_vx,
  1.5698 +					      pixman_bool_t    zero_src)
  1.5699 +{
  1.5700 +    BILINEAR_DECLARE_VARIABLES;
  1.5701 +    uint32_t pix1, pix2, pix3, pix4;
  1.5702 +
  1.5703 +    while (w && ((uintptr_t)dst & 15))
  1.5704 +    {
  1.5705 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  1.5706 +
  1.5707 +	if (pix1)
  1.5708 +	{
  1.5709 +	    pix2 = *dst;
  1.5710 +	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
  1.5711 +	}
  1.5712 +
  1.5713 +	w--;
  1.5714 +	dst++;
  1.5715 +    }
  1.5716 +
  1.5717 +    while (w >= 4)
  1.5718 +    {
  1.5719 +	__m128i xmm_src;
  1.5720 +	__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
  1.5721 +	__m128i xmm_alpha_hi, xmm_alpha_lo;
  1.5722 +
  1.5723 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  1.5724 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
  1.5725 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
  1.5726 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
  1.5727 +
  1.5728 +	xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
  1.5729 +
  1.5730 +	if (!is_zero (xmm_src))
  1.5731 +	{
  1.5732 +	    if (is_opaque (xmm_src))
  1.5733 +	    {
  1.5734 +		save_128_aligned ((__m128i *)dst, xmm_src);
  1.5735 +	    }
  1.5736 +	    else
  1.5737 +	    {
  1.5738 +		__m128i xmm_dst = load_128_aligned ((__m128i *)dst);
  1.5739 +
  1.5740 +		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  1.5741 +		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1.5742 +
  1.5743 +		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
  1.5744 +		over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
  1.5745 +			    &xmm_dst_lo, &xmm_dst_hi);
  1.5746 +
  1.5747 +		save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.5748 +	    }
  1.5749 +	}
  1.5750 +
  1.5751 +	w -= 4;
  1.5752 +	dst += 4;
  1.5753 +    }
  1.5754 +
  1.5755 +    while (w)
  1.5756 +    {
  1.5757 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  1.5758 +
  1.5759 +	if (pix1)
  1.5760 +	{
  1.5761 +	    pix2 = *dst;
  1.5762 +	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
  1.5763 +	}
  1.5764 +
  1.5765 +	w--;
  1.5766 +	dst++;
  1.5767 +    }
  1.5768 +}
  1.5769 +
  1.5770 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
  1.5771 +			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
  1.5772 +			       uint32_t, uint32_t, uint32_t,
  1.5773 +			       COVER, FLAG_NONE)
  1.5774 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
  1.5775 +			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
  1.5776 +			       uint32_t, uint32_t, uint32_t,
  1.5777 +			       PAD, FLAG_NONE)
  1.5778 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
  1.5779 +			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
  1.5780 +			       uint32_t, uint32_t, uint32_t,
  1.5781 +			       NONE, FLAG_NONE)
  1.5782 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
  1.5783 +			       scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
  1.5784 +			       uint32_t, uint32_t, uint32_t,
  1.5785 +			       NORMAL, FLAG_NONE)
  1.5786 +
  1.5787 +
  1.5788 +/* An example of SSE2 two-stage bilinear_over_8888_0565 fast path, which is implemented
  1.5789 +   as scaled_bilinear_scanline_sse2_8888_8888_SRC + op_bilinear_over_8888_0565 */
  1.5790 +
  1.5791 +static force_inline void
         +op_bilinear_over_8888_0565 (uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width)
  1.5792 +{
  1.5793 +    /* Note: this is not really fast and should be based on the 8 pixel loop from sse2_composite_over_8888_0565 */
  1.5794 +    while (--width >= 0)
  1.5795 +    {
  1.5796 +	*dst = composite_over_8888_0565pixel (*src, *dst);
  1.5797 +	src++;
  1.5798 +	dst++;
  1.5799 +    }
  1.5800 +}
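         +
         +/* Conceptually, each destination pixel above becomes
         + *
         + *     dst = to_0565 (OVER (src, to_8888 (dst)))
         + *
         + * with composite_over_8888_0565pixel (defined earlier in this file) doing the
         + * per-pixel work. */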
  1.5801 +
  1.5802 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_cover_OVER,
  1.5803 +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
  1.5804 +			       uint32_t, uint32_t, uint16_t,
  1.5805 +			       COVER, FLAG_NONE)
  1.5806 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_pad_OVER,
  1.5807 +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
  1.5808 +			       uint32_t, uint32_t, uint16_t,
  1.5809 +			       PAD, FLAG_NONE)
  1.5810 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_none_OVER,
  1.5811 +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
  1.5812 +			       uint32_t, uint32_t, uint16_t,
  1.5813 +			       NONE, FLAG_NONE)
  1.5814 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_normal_OVER,
  1.5815 +			       scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
  1.5816 +			       uint32_t, uint32_t, uint16_t,
  1.5817 +			       NORMAL, FLAG_NONE)
  1.5818 +
  1.5819 +/*****************************/
  1.5820 +
  1.5821 +static force_inline void
  1.5822 +scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
  1.5823 +						const uint8_t  * mask,
  1.5824 +						const uint32_t * src_top,
  1.5825 +						const uint32_t * src_bottom,
  1.5826 +						int32_t          w,
  1.5827 +						int              wt,
  1.5828 +						int              wb,
  1.5829 +						pixman_fixed_t   vx,
  1.5830 +						pixman_fixed_t   unit_x,
  1.5831 +						pixman_fixed_t   max_vx,
  1.5832 +						pixman_bool_t    zero_src)
  1.5833 +{
  1.5834 +    BILINEAR_DECLARE_VARIABLES;
  1.5835 +    uint32_t pix1, pix2, pix3, pix4;
  1.5836 +    uint32_t m;
  1.5837 +
  1.5838 +    while (w && ((uintptr_t)dst & 15))
  1.5839 +    {
  1.5840 +	uint32_t sa;
  1.5841 +
  1.5842 +	m = (uint32_t) *mask++;
  1.5843 +
  1.5844 +	if (m)
  1.5845 +	{
  1.5846 +	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  1.5847 +	    sa = pix1 >> 24;
  1.5848 +
  1.5849 +	    if (sa == 0xff && m == 0xff)
  1.5850 +	    {
  1.5851 +		*dst = pix1;
  1.5852 +	    }
  1.5853 +	    else
  1.5854 +	    {
  1.5855 +		__m128i ms, md, ma, msa;
  1.5856 +
  1.5857 +		pix2 = *dst;
  1.5858 +		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
  1.5859 +		ms = unpack_32_1x128 (pix1);
  1.5860 +		md = unpack_32_1x128 (pix2);
  1.5861 +
  1.5862 +		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
  1.5863 +
  1.5864 +		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
  1.5865 +	    }
  1.5866 +	}
  1.5867 +	else
  1.5868 +	{
  1.5869 +	    BILINEAR_SKIP_ONE_PIXEL ();
  1.5870 +	}
  1.5871 +
  1.5872 +	w--;
  1.5873 +	dst++;
  1.5874 +    }
  1.5875 +
  1.5876 +    while (w >= 4)
  1.5877 +    {
  1.5878 +	__m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
  1.5879 +	__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1.5880 +	__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
  1.5881 +
  1.5882 +	m = *(uint32_t*)mask;
  1.5883 +
  1.5884 +	if (m)
  1.5885 +	{
  1.5886 +	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  1.5887 +	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
  1.5888 +	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
  1.5889 +	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
  1.5890 +
  1.5891 +	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
  1.5892 +
  1.5893 +	    if (m == 0xffffffff && is_opaque (xmm_src))
  1.5894 +	    {
  1.5895 +		save_128_aligned ((__m128i *)dst, xmm_src);
  1.5896 +	    }
  1.5897 +	    else
  1.5898 +	    {
  1.5899 +		xmm_dst = load_128_aligned ((__m128i *)dst);
  1.5900 +
  1.5901 +		xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
  1.5902 +
  1.5903 +		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  1.5904 +		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
  1.5905 +		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1.5906 +
  1.5907 +		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
  1.5908 +		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
  1.5909 +
  1.5910 +		in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
  1.5911 +			       &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
  1.5912 +
  1.5913 +		save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.5914 +	    }
  1.5915 +	}
  1.5916 +	else
  1.5917 +	{
  1.5918 +	    BILINEAR_SKIP_ONE_PIXEL ();
  1.5919 +	    BILINEAR_SKIP_ONE_PIXEL ();
  1.5920 +	    BILINEAR_SKIP_ONE_PIXEL ();
  1.5921 +	    BILINEAR_SKIP_ONE_PIXEL ();
  1.5922 +	}
  1.5923 +
  1.5924 +	w -= 4;
  1.5925 +	dst += 4;
  1.5926 +	mask += 4;
  1.5927 +    }
  1.5928 +
  1.5929 +    while (w)
  1.5930 +    {
  1.5931 +	uint32_t sa;
  1.5932 +
  1.5933 +	m = (uint32_t) *mask++;
  1.5934 +
  1.5935 +	if (m)
  1.5936 +	{
  1.5937 +	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  1.5938 +	    sa = pix1 >> 24;
  1.5939 +
  1.5940 +	    if (sa == 0xff && m == 0xff)
  1.5941 +	    {
  1.5942 +		*dst = pix1;
  1.5943 +	    }
  1.5944 +	    else
  1.5945 +	    {
  1.5946 +		__m128i ms, md, ma, msa;
  1.5947 +
  1.5948 +		pix2 = *dst;
  1.5949 +		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
  1.5950 +		ms = unpack_32_1x128 (pix1);
  1.5951 +		md = unpack_32_1x128 (pix2);
  1.5952 +
  1.5953 +		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
  1.5954 +
  1.5955 +		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
  1.5956 +	    }
  1.5957 +	}
  1.5958 +	else
  1.5959 +	{
  1.5960 +	    BILINEAR_SKIP_ONE_PIXEL ();
  1.5961 +	}
  1.5962 +
  1.5963 +	w--;
  1.5964 +	dst++;
  1.5965 +    }
  1.5966 +}
  1.5967 +
  1.5968 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
  1.5969 +			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
  1.5970 +			       uint32_t, uint8_t, uint32_t,
  1.5971 +			       COVER, FLAG_HAVE_NON_SOLID_MASK)
  1.5972 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
  1.5973 +			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
  1.5974 +			       uint32_t, uint8_t, uint32_t,
  1.5975 +			       PAD, FLAG_HAVE_NON_SOLID_MASK)
  1.5976 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
  1.5977 +			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
  1.5978 +			       uint32_t, uint8_t, uint32_t,
  1.5979 +			       NONE, FLAG_HAVE_NON_SOLID_MASK)
  1.5980 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
  1.5981 +			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
  1.5982 +			       uint32_t, uint8_t, uint32_t,
  1.5983 +			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
  1.5984 +
  1.5985 +static force_inline void
  1.5986 +scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
  1.5987 +						const uint32_t * mask,
  1.5988 +						const uint32_t * src_top,
  1.5989 +						const uint32_t * src_bottom,
  1.5990 +						int32_t          w,
  1.5991 +						int              wt,
  1.5992 +						int              wb,
  1.5993 +						pixman_fixed_t   vx,
  1.5994 +						pixman_fixed_t   unit_x,
  1.5995 +						pixman_fixed_t   max_vx,
  1.5996 +						pixman_bool_t    zero_src)
  1.5997 +{
  1.5998 +    BILINEAR_DECLARE_VARIABLES;
  1.5999 +    uint32_t pix1, pix2, pix3, pix4;
  1.6000 +    __m128i xmm_mask;
  1.6001 +
  1.6002 +    if (zero_src || (*mask >> 24) == 0)
  1.6003 +	return;
  1.6004 +
  1.6005 +    xmm_mask = create_mask_16_128 (*mask >> 24);
  1.6006 +
  1.6007 +    while (w && ((uintptr_t)dst & 15))
  1.6008 +    {
  1.6009 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  1.6010 +	if (pix1)
  1.6011 +	{
  1.6012 +		uint32_t d = *dst;
  1.6013 +
  1.6014 +		__m128i ms = unpack_32_1x128 (pix1);
  1.6015 +		__m128i alpha     = expand_alpha_1x128 (ms);
  1.6016 +		__m128i dest      = xmm_mask;
  1.6017 +		__m128i alpha_dst = unpack_32_1x128 (d);
  1.6018 +
  1.6019 +		*dst = pack_1x128_32
  1.6020 +			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
  1.6021 +	}
  1.6022 +
  1.6023 +	dst++;
  1.6024 +	w--;
  1.6025 +    }
  1.6026 +
  1.6027 +    while (w >= 4)
  1.6028 +    {
  1.6029 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  1.6030 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
  1.6031 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
  1.6032 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
  1.6033 +
  1.6034 +	if (pix1 | pix2 | pix3 | pix4)
  1.6035 +	{
  1.6036 +	    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
  1.6037 +	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
  1.6038 +	    __m128i xmm_alpha_lo, xmm_alpha_hi;
  1.6039 +
  1.6040 +	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
  1.6041 +
  1.6042 +	    xmm_dst = load_128_aligned ((__m128i*)dst);
  1.6043 +
  1.6044 +	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
  1.6045 +	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
  1.6046 +	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
  1.6047 +				&xmm_alpha_lo, &xmm_alpha_hi);
  1.6048 +
  1.6049 +	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
  1.6050 +			   &xmm_alpha_lo, &xmm_alpha_hi,
  1.6051 +			   &xmm_mask, &xmm_mask,
  1.6052 +			   &xmm_dst_lo, &xmm_dst_hi);
  1.6053 +
  1.6054 +	    save_128_aligned
  1.6055 +		((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
  1.6056 +	}
  1.6057 +
  1.6058 +	dst += 4;
  1.6059 +	w -= 4;
  1.6060 +    }
  1.6061 +
  1.6062 +    while (w)
  1.6063 +    {
  1.6064 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  1.6065 +	if (pix1)
  1.6066 +	{
  1.6067 +		uint32_t d = *dst;
  1.6068 +
  1.6069 +		__m128i ms = unpack_32_1x128 (pix1);
  1.6070 +		__m128i alpha     = expand_alpha_1x128 (ms);
  1.6071 +		__m128i dest      = xmm_mask;
  1.6072 +		__m128i alpha_dst = unpack_32_1x128 (d);
  1.6073 +
  1.6074 +		*dst = pack_1x128_32
  1.6075 +			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
  1.6076 +	}
  1.6077 +
  1.6078 +	dst++;
  1.6079 +	w--;
  1.6080 +    }
  1.6081 +}
  1.6082 +
  1.6083 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
  1.6084 +			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
  1.6085 +			       uint32_t, uint32_t, uint32_t,
  1.6086 +			       COVER, FLAG_HAVE_SOLID_MASK)
  1.6087 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
  1.6088 +			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
  1.6089 +			       uint32_t, uint32_t, uint32_t,
  1.6090 +			       PAD, FLAG_HAVE_SOLID_MASK)
  1.6091 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
  1.6092 +			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
  1.6093 +			       uint32_t, uint32_t, uint32_t,
  1.6094 +			       NONE, FLAG_HAVE_SOLID_MASK)
  1.6095 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
  1.6096 +			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
  1.6097 +			       uint32_t, uint32_t, uint32_t,
  1.6098 +			       NORMAL, FLAG_HAVE_SOLID_MASK)
  1.6099 +
  1.6100 +static const pixman_fast_path_t sse2_fast_paths[] =
  1.6101 +{
  1.6102 +    /* PIXMAN_OP_OVER */
  1.6103 +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
  1.6104 +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
  1.6105 +    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
  1.6106 +    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
  1.6107 +    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
  1.6108 +    PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
  1.6109 +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
  1.6110 +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
  1.6111 +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
  1.6112 +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
  1.6113 +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
  1.6114 +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
  1.6115 +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
  1.6116 +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
  1.6117 +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
  1.6118 +    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
  1.6119 +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
  1.6120 +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
  1.6121 +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
  1.6122 +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
  1.6123 +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
  1.6124 +    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
  1.6125 +    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
  1.6126 +    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
  1.6127 +    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
  1.6128 +    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
  1.6129 +    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
  1.6130 +    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
  1.6131 +    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
  1.6132 +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
  1.6133 +    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
  1.6134 +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
  1.6135 +    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
  1.6136 +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
  1.6137 +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
  1.6138 +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
  1.6139 +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
  1.6140 +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
  1.6141 +    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
  1.6142 +    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
  1.6143 +    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
  1.6144 +    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
  1.6145 +    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
  1.6146 +    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
  1.6147 +    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
  1.6148 +    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
  1.6149 +    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
  1.6150 +
  1.6151 +    /* PIXMAN_OP_OVER_REVERSE */
  1.6152 +    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
  1.6153 +    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
  1.6154 +
  1.6155 +    /* PIXMAN_OP_ADD */
  1.6156 +    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
  1.6157 +    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
  1.6158 +    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
  1.6159 +    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
  1.6160 +    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
  1.6161 +    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
  1.6162 +    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
  1.6163 +    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
  1.6164 +    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
  1.6165 +    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
  1.6166 +    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
  1.6167 +    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
  1.6168 +    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
  1.6169 +    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
  1.6170 +
  1.6171 +    /* PIXMAN_OP_SRC */
  1.6172 +    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
  1.6173 +    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
  1.6174 +    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
  1.6175 +    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
  1.6176 +    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
  1.6177 +    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
  1.6178 +    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
  1.6179 +    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
  1.6180 +    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
  1.6181 +    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
  1.6182 +    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
  1.6183 +    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
  1.6184 +    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
  1.6185 +    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
  1.6186 +    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
  1.6187 +    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
  1.6188 +    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
  1.6189 +    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
  1.6190 +
  1.6191 +    /* PIXMAN_OP_IN */
  1.6192 +    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
  1.6193 +    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
  1.6194 +    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
  1.6195 +
  1.6196 +    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
  1.6197 +    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
  1.6198 +    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
  1.6199 +    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  1.6200 +    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
  1.6201 +    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
  1.6202 +    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
  1.6203 +    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  1.6204 +    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
  1.6205 +    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
  1.6206 +    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
  1.6207 +    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  1.6208 +    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
  1.6209 +    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
  1.6210 +    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
  1.6211 +    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  1.6212 +
  1.6213 +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
  1.6214 +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
  1.6215 +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
  1.6216 +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
  1.6217 +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
  1.6218 +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
  1.6219 +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
  1.6220 +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
  1.6221 +
  1.6222 +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
  1.6223 +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
  1.6224 +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
  1.6225 +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  1.6226 +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
  1.6227 +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
  1.6228 +
  1.6229 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
  1.6230 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
  1.6231 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
  1.6232 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
  1.6233 +
  1.6234 +    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
  1.6235 +    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
  1.6236 +    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
  1.6237 +    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
  1.6238 +
  1.6239 +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
  1.6240 +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
  1.6241 +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
  1.6242 +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
  1.6243 +
  1.6244 +    /* and here the entries needed for this path are added to the fast path table */
  1.6245 +
  1.6246 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, sse2_8888_0565),
  1.6247 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, b5g6r5, sse2_8888_0565),
  1.6248 +
  1.6249 +    { PIXMAN_OP_NONE },
  1.6250 +};
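         +
         +/* This table is passed to _pixman_implementation_create () below.  At
         + * composite time pixman scans such tables in order and picks the first entry
         + * whose operator, formats and flags match the request, so ordering within the
         + * table matters for overlapping entries. */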
  1.6251 +
  1.6252 +static uint32_t *
  1.6253 +sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
  1.6254 +{
  1.6255 +    int w = iter->width;
  1.6256 +    __m128i ff000000 = mask_ff000000;
  1.6257 +    uint32_t *dst = iter->buffer;
  1.6258 +    uint32_t *src = (uint32_t *)iter->bits;
  1.6259 +
  1.6260 +    iter->bits += iter->stride;
  1.6261 +
  1.6262 +    while (w && ((uintptr_t)dst) & 0x0f)
  1.6263 +    {
  1.6264 +	*dst++ = (*src++) | 0xff000000;
  1.6265 +	w--;
  1.6266 +    }
  1.6267 +
  1.6268 +    while (w >= 4)
  1.6269 +    {
  1.6270 +	save_128_aligned (
  1.6271 +	    (__m128i *)dst, _mm_or_si128 (
  1.6272 +		load_128_unaligned ((__m128i *)src), ff000000));
  1.6273 +
  1.6274 +	dst += 4;
  1.6275 +	src += 4;
  1.6276 +	w -= 4;
  1.6277 +    }
  1.6278 +
  1.6279 +    while (w)
  1.6280 +    {
  1.6281 +	*dst++ = (*src++) | 0xff000000;
  1.6282 +	w--;
  1.6283 +    }
  1.6284 +
  1.6285 +    return iter->buffer;
  1.6286 +}
  1.6287 +
  1.6288 +static uint32_t *
  1.6289 +sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
  1.6290 +{
  1.6291 +    int w = iter->width;
  1.6292 +    uint32_t *dst = iter->buffer;
  1.6293 +    uint16_t *src = (uint16_t *)iter->bits;
  1.6294 +    __m128i ff000000 = mask_ff000000;
  1.6295 +
  1.6296 +    iter->bits += iter->stride;
  1.6297 +
  1.6298 +    while (w && ((uintptr_t)dst) & 0x0f)
  1.6299 +    {
  1.6300 +	uint16_t s = *src++;
  1.6301 +
  1.6302 +	*dst++ = convert_0565_to_8888 (s);
  1.6303 +	w--;
  1.6304 +    }
  1.6305 +
  1.6306 +    while (w >= 8)
  1.6307 +    {
  1.6308 +	__m128i lo, hi, s;
  1.6309 +
  1.6310 +	s = _mm_loadu_si128 ((__m128i *)src);
  1.6311 +
  1.6312 +	lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
  1.6313 +	hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
  1.6314 +
  1.6315 +	save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
  1.6316 +	save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
  1.6317 +
  1.6318 +	dst += 8;
  1.6319 +	src += 8;
  1.6320 +	w -= 8;
  1.6321 +    }
  1.6322 +
  1.6323 +    while (w)
  1.6324 +    {
  1.6325 +	uint16_t s = *src++;
  1.6326 +
  1.6327 +	*dst++ = convert_0565_to_8888 (s);
  1.6328 +	w--;
  1.6329 +    }
  1.6330 +
  1.6331 +    return iter->buffer;
  1.6332 +}
  1.6333 +
  1.6334 +static uint32_t *
  1.6335 +sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
  1.6336 +{
  1.6337 +    int w = iter->width;
  1.6338 +    uint32_t *dst = iter->buffer;
  1.6339 +    uint8_t *src = iter->bits;
  1.6340 +    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
  1.6341 +
  1.6342 +    iter->bits += iter->stride;
  1.6343 +
  1.6344 +    while (w && (((uintptr_t)dst) & 15))
  1.6345 +    {
  1.6346 +        *dst++ = *(src++) << 24;
  1.6347 +        w--;
  1.6348 +    }
  1.6349 +
  1.6350 +    while (w >= 16)
  1.6351 +    {
  1.6352 +	xmm0 = _mm_loadu_si128((__m128i *)src);
  1.6353 +
  1.6354 +	xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
  1.6355 +	xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
  1.6356 +	xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
  1.6357 +	xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
  1.6358 +	xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
  1.6359 +	xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
  1.6360 +
  1.6361 +	_mm_store_si128(((__m128i *)(dst +  0)), xmm3);
  1.6362 +	_mm_store_si128(((__m128i *)(dst +  4)), xmm4);
  1.6363 +	_mm_store_si128(((__m128i *)(dst +  8)), xmm5);
  1.6364 +	_mm_store_si128(((__m128i *)(dst + 12)), xmm6);
  1.6365 +
  1.6366 +	dst += 16;
  1.6367 +	src += 16;
  1.6368 +	w -= 16;
  1.6369 +    }
  1.6370 +
  1.6371 +    while (w)
  1.6372 +    {
  1.6373 +	*dst++ = *(src++) << 24;
  1.6374 +	w--;
  1.6375 +    }
  1.6376 +
  1.6377 +    return iter->buffer;
  1.6378 +}
  1.6379 +
  1.6380 +typedef struct
  1.6381 +{
  1.6382 +    pixman_format_code_t	format;
  1.6383 +    pixman_iter_get_scanline_t	get_scanline;
  1.6384 +} fetcher_info_t;
  1.6385 +
  1.6386 +static const fetcher_info_t fetchers[] =
  1.6387 +{
  1.6388 +    { PIXMAN_x8r8g8b8,		sse2_fetch_x8r8g8b8 },
  1.6389 +    { PIXMAN_r5g6b5,		sse2_fetch_r5g6b5 },
  1.6390 +    { PIXMAN_a8,		sse2_fetch_a8 },
  1.6391 +    { PIXMAN_null }
  1.6392 +};
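         +
         +/* sse2_src_iter_init () below walks this table and installs the matching
         + * get_scanline callback for narrow, untransformed bits images whose samples
         + * fully cover the clip. */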
  1.6393 +
  1.6394 +static pixman_bool_t
  1.6395 +sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
  1.6396 +{
  1.6397 +    pixman_image_t *image = iter->image;
  1.6398 +
  1.6399 +#define FLAGS								\
  1.6400 +    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
  1.6401 +     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
  1.6402 +
  1.6403 +    if ((iter->iter_flags & ITER_NARROW)			&&
  1.6404 +	(iter->image_flags & FLAGS) == FLAGS)
  1.6405 +    {
  1.6406 +	const fetcher_info_t *f;
  1.6407 +
  1.6408 +	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
  1.6409 +	{
  1.6410 +	    if (image->common.extended_format_code == f->format)
  1.6411 +	    {
  1.6412 +		uint8_t *b = (uint8_t *)image->bits.bits;
  1.6413 +		int s = image->bits.rowstride * 4;
  1.6414 +
  1.6415 +		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
  1.6416 +		iter->stride = s;
  1.6417 +
  1.6418 +		iter->get_scanline = f->get_scanline;
  1.6419 +		return TRUE;
  1.6420 +	    }
  1.6421 +	}
  1.6422 +    }
  1.6423 +
  1.6424 +    return FALSE;
  1.6425 +}
  1.6426 +
  1.6427 +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
  1.6428 +__attribute__((__force_align_arg_pointer__))
  1.6429 +#endif
  1.6430 +pixman_implementation_t *
  1.6431 +_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
  1.6432 +{
  1.6433 +    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
  1.6434 +
  1.6435 +    /* SSE2 constants */
  1.6436 +    mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
  1.6437 +    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
  1.6438 +    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
  1.6439 +    mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
  1.6440 +    mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
  1.6441 +    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
  1.6442 +    mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
  1.6443 +    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
  1.6444 +    mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
  1.6445 +    mask_0080 = create_mask_16_128 (0x0080);
  1.6446 +    mask_00ff = create_mask_16_128 (0x00ff);
  1.6447 +    mask_0101 = create_mask_16_128 (0x0101);
  1.6448 +    mask_ffff = create_mask_16_128 (0xffff);
  1.6449 +    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
  1.6450 +    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
  1.6451 +    mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
  1.6452 +    mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
  1.6453 +
  1.6454 +    /* Set up function pointers */
  1.6455 +    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
  1.6456 +    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
  1.6457 +    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
  1.6458 +    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
  1.6459 +    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
  1.6460 +    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
  1.6461 +    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
  1.6462 +    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
  1.6463 +    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
  1.6464 +    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
  1.6465 +
  1.6466 +    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
  1.6467 +
  1.6468 +    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
  1.6469 +    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
  1.6470 +    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
  1.6471 +    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
  1.6472 +    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
  1.6473 +    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
  1.6474 +    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
  1.6475 +    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
  1.6476 +    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
  1.6477 +    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
  1.6478 +    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
  1.6479 +
  1.6480 +    imp->blt = sse2_blt;
  1.6481 +    imp->fill = sse2_fill;
  1.6482 +
  1.6483 +    imp->src_iter_init = sse2_src_iter_init;
  1.6484 +
  1.6485 +    return imp;
  1.6486 +}
