1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/gfx/cairo/libpixman/src/pixman-sse2.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,6483 @@ 1.4 +/* 1.5 + * Copyright © 2008 Rodrigo Kumpera 1.6 + * Copyright © 2008 André Tupinambá 1.7 + * 1.8 + * Permission to use, copy, modify, distribute, and sell this software and its 1.9 + * documentation for any purpose is hereby granted without fee, provided that 1.10 + * the above copyright notice appear in all copies and that both that 1.11 + * copyright notice and this permission notice appear in supporting 1.12 + * documentation, and that the name of Red Hat not be used in advertising or 1.13 + * publicity pertaining to distribution of the software without specific, 1.14 + * written prior permission. Red Hat makes no representations about the 1.15 + * suitability of this software for any purpose. It is provided "as is" 1.16 + * without express or implied warranty. 1.17 + * 1.18 + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS 1.19 + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND 1.20 + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY 1.21 + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES 1.22 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN 1.23 + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING 1.24 + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS 1.25 + * SOFTWARE. 1.26 + * 1.27 + * Author: Rodrigo Kumpera (kumpera@gmail.com) 1.28 + * André Tupinambá (andrelrt@gmail.com) 1.29 + * 1.30 + * Based on work by Owen Taylor and Søren Sandmann 1.31 + */ 1.32 +#ifdef HAVE_CONFIG_H 1.33 +#include <config.h> 1.34 +#endif 1.35 + 1.36 +#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ 1.37 +#include <emmintrin.h> /* for SSE2 intrinsics */ 1.38 +#include "pixman-private.h" 1.39 +#include "pixman-combine32.h" 1.40 +#include "pixman-inlines.h" 1.41 + 1.42 +static __m128i mask_0080; 1.43 +static __m128i mask_00ff; 1.44 +static __m128i mask_0101; 1.45 +static __m128i mask_ffff; 1.46 +static __m128i mask_ff000000; 1.47 +static __m128i mask_alpha; 1.48 + 1.49 +static __m128i mask_565_r; 1.50 +static __m128i mask_565_g1, mask_565_g2; 1.51 +static __m128i mask_565_b; 1.52 +static __m128i mask_red; 1.53 +static __m128i mask_green; 1.54 +static __m128i mask_blue; 1.55 + 1.56 +static __m128i mask_565_fix_rb; 1.57 +static __m128i mask_565_fix_g; 1.58 + 1.59 +static __m128i mask_565_rb; 1.60 +static __m128i mask_565_pack_multiplier; 1.61 + 1.62 +static force_inline __m128i 1.63 +unpack_32_1x128 (uint32_t data) 1.64 +{ 1.65 + return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ()); 1.66 +} 1.67 + 1.68 +static force_inline void 1.69 +unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi) 1.70 +{ 1.71 + *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ()); 1.72 + *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ()); 1.73 +} 1.74 + 1.75 +static force_inline __m128i 1.76 +unpack_565_to_8888 (__m128i lo) 1.77 +{ 1.78 + __m128i r, g, b, rb, t; 1.79 + 1.80 + r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red); 1.81 + g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green); 1.82 + b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue); 1.83 + 1.84 + rb = _mm_or_si128 (r, b); 1.85 + t = _mm_and_si128 (rb, mask_565_fix_rb); 1.86 + t = _mm_srli_epi32 (t, 5); 1.87 + rb = _mm_or_si128 (rb, t); 1.88 + 1.89 + t = _mm_and_si128 (g, mask_565_fix_g); 1.90 + t = _mm_srli_epi32 (t, 6); 1.91 + g = 
_mm_or_si128 (g, t); 1.92 + 1.93 + return _mm_or_si128 (rb, g); 1.94 +} 1.95 + 1.96 +static force_inline void 1.97 +unpack_565_128_4x128 (__m128i data, 1.98 + __m128i* data0, 1.99 + __m128i* data1, 1.100 + __m128i* data2, 1.101 + __m128i* data3) 1.102 +{ 1.103 + __m128i lo, hi; 1.104 + 1.105 + lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ()); 1.106 + hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ()); 1.107 + 1.108 + lo = unpack_565_to_8888 (lo); 1.109 + hi = unpack_565_to_8888 (hi); 1.110 + 1.111 + unpack_128_2x128 (lo, data0, data1); 1.112 + unpack_128_2x128 (hi, data2, data3); 1.113 +} 1.114 + 1.115 +static force_inline uint16_t 1.116 +pack_565_32_16 (uint32_t pixel) 1.117 +{ 1.118 + return (uint16_t) (((pixel >> 8) & 0xf800) | 1.119 + ((pixel >> 5) & 0x07e0) | 1.120 + ((pixel >> 3) & 0x001f)); 1.121 +} 1.122 + 1.123 +static force_inline __m128i 1.124 +pack_2x128_128 (__m128i lo, __m128i hi) 1.125 +{ 1.126 + return _mm_packus_epi16 (lo, hi); 1.127 +} 1.128 + 1.129 +static force_inline __m128i 1.130 +pack_565_2packedx128_128 (__m128i lo, __m128i hi) 1.131 +{ 1.132 + __m128i rb0 = _mm_and_si128 (lo, mask_565_rb); 1.133 + __m128i rb1 = _mm_and_si128 (hi, mask_565_rb); 1.134 + 1.135 + __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier); 1.136 + __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier); 1.137 + 1.138 + __m128i g0 = _mm_and_si128 (lo, mask_green); 1.139 + __m128i g1 = _mm_and_si128 (hi, mask_green); 1.140 + 1.141 + t0 = _mm_or_si128 (t0, g0); 1.142 + t1 = _mm_or_si128 (t1, g1); 1.143 + 1.144 + /* Simulates _mm_packus_epi32 */ 1.145 + t0 = _mm_slli_epi32 (t0, 16 - 5); 1.146 + t1 = _mm_slli_epi32 (t1, 16 - 5); 1.147 + t0 = _mm_srai_epi32 (t0, 16); 1.148 + t1 = _mm_srai_epi32 (t1, 16); 1.149 + return _mm_packs_epi32 (t0, t1); 1.150 +} 1.151 + 1.152 +static force_inline __m128i 1.153 +pack_565_2x128_128 (__m128i lo, __m128i hi) 1.154 +{ 1.155 + __m128i data; 1.156 + __m128i r, g1, g2, b; 1.157 + 1.158 + data = pack_2x128_128 (lo, hi); 1.159 + 1.160 + r = _mm_and_si128 (data, mask_565_r); 1.161 + g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1); 1.162 + g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2); 1.163 + b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b); 1.164 + 1.165 + return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b); 1.166 +} 1.167 + 1.168 +static force_inline __m128i 1.169 +pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3) 1.170 +{ 1.171 + return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1), 1.172 + pack_565_2x128_128 (*xmm2, *xmm3)); 1.173 +} 1.174 + 1.175 +static force_inline int 1.176 +is_opaque (__m128i x) 1.177 +{ 1.178 + __m128i ffs = _mm_cmpeq_epi8 (x, x); 1.179 + 1.180 + return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888; 1.181 +} 1.182 + 1.183 +static force_inline int 1.184 +is_zero (__m128i x) 1.185 +{ 1.186 + return _mm_movemask_epi8 ( 1.187 + _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff; 1.188 +} 1.189 + 1.190 +static force_inline int 1.191 +is_transparent (__m128i x) 1.192 +{ 1.193 + return (_mm_movemask_epi8 ( 1.194 + _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888; 1.195 +} 1.196 + 1.197 +static force_inline __m128i 1.198 +expand_pixel_32_1x128 (uint32_t data) 1.199 +{ 1.200 + return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0)); 1.201 +} 1.202 + 1.203 +static force_inline __m128i 1.204 +expand_alpha_1x128 (__m128i data) 1.205 +{ 1.206 + return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, 1.207 + 
_MM_SHUFFLE (3, 3, 3, 3)), 1.208 + _MM_SHUFFLE (3, 3, 3, 3)); 1.209 +} 1.210 + 1.211 +static force_inline void 1.212 +expand_alpha_2x128 (__m128i data_lo, 1.213 + __m128i data_hi, 1.214 + __m128i* alpha_lo, 1.215 + __m128i* alpha_hi) 1.216 +{ 1.217 + __m128i lo, hi; 1.218 + 1.219 + lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3)); 1.220 + hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3)); 1.221 + 1.222 + *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3)); 1.223 + *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3)); 1.224 +} 1.225 + 1.226 +static force_inline void 1.227 +expand_alpha_rev_2x128 (__m128i data_lo, 1.228 + __m128i data_hi, 1.229 + __m128i* alpha_lo, 1.230 + __m128i* alpha_hi) 1.231 +{ 1.232 + __m128i lo, hi; 1.233 + 1.234 + lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0)); 1.235 + hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0)); 1.236 + *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0)); 1.237 + *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0)); 1.238 +} 1.239 + 1.240 +static force_inline void 1.241 +pix_multiply_2x128 (__m128i* data_lo, 1.242 + __m128i* data_hi, 1.243 + __m128i* alpha_lo, 1.244 + __m128i* alpha_hi, 1.245 + __m128i* ret_lo, 1.246 + __m128i* ret_hi) 1.247 +{ 1.248 + __m128i lo, hi; 1.249 + 1.250 + lo = _mm_mullo_epi16 (*data_lo, *alpha_lo); 1.251 + hi = _mm_mullo_epi16 (*data_hi, *alpha_hi); 1.252 + lo = _mm_adds_epu16 (lo, mask_0080); 1.253 + hi = _mm_adds_epu16 (hi, mask_0080); 1.254 + *ret_lo = _mm_mulhi_epu16 (lo, mask_0101); 1.255 + *ret_hi = _mm_mulhi_epu16 (hi, mask_0101); 1.256 +} 1.257 + 1.258 +static force_inline void 1.259 +pix_add_multiply_2x128 (__m128i* src_lo, 1.260 + __m128i* src_hi, 1.261 + __m128i* alpha_dst_lo, 1.262 + __m128i* alpha_dst_hi, 1.263 + __m128i* dst_lo, 1.264 + __m128i* dst_hi, 1.265 + __m128i* alpha_src_lo, 1.266 + __m128i* alpha_src_hi, 1.267 + __m128i* ret_lo, 1.268 + __m128i* ret_hi) 1.269 +{ 1.270 + __m128i t1_lo, t1_hi; 1.271 + __m128i t2_lo, t2_hi; 1.272 + 1.273 + pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi); 1.274 + pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi); 1.275 + 1.276 + *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo); 1.277 + *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi); 1.278 +} 1.279 + 1.280 +static force_inline void 1.281 +negate_2x128 (__m128i data_lo, 1.282 + __m128i data_hi, 1.283 + __m128i* neg_lo, 1.284 + __m128i* neg_hi) 1.285 +{ 1.286 + *neg_lo = _mm_xor_si128 (data_lo, mask_00ff); 1.287 + *neg_hi = _mm_xor_si128 (data_hi, mask_00ff); 1.288 +} 1.289 + 1.290 +static force_inline void 1.291 +invert_colors_2x128 (__m128i data_lo, 1.292 + __m128i data_hi, 1.293 + __m128i* inv_lo, 1.294 + __m128i* inv_hi) 1.295 +{ 1.296 + __m128i lo, hi; 1.297 + 1.298 + lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2)); 1.299 + hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2)); 1.300 + *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2)); 1.301 + *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2)); 1.302 +} 1.303 + 1.304 +static force_inline void 1.305 +over_2x128 (__m128i* src_lo, 1.306 + __m128i* src_hi, 1.307 + __m128i* alpha_lo, 1.308 + __m128i* alpha_hi, 1.309 + __m128i* dst_lo, 1.310 + __m128i* dst_hi) 1.311 +{ 1.312 + __m128i t1, t2; 1.313 + 1.314 + negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2); 1.315 + 1.316 + pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi); 1.317 + 1.318 + *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo); 
1.319 + *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi); 1.320 +} 1.321 + 1.322 +static force_inline void 1.323 +over_rev_non_pre_2x128 (__m128i src_lo, 1.324 + __m128i src_hi, 1.325 + __m128i* dst_lo, 1.326 + __m128i* dst_hi) 1.327 +{ 1.328 + __m128i lo, hi; 1.329 + __m128i alpha_lo, alpha_hi; 1.330 + 1.331 + expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi); 1.332 + 1.333 + lo = _mm_or_si128 (alpha_lo, mask_alpha); 1.334 + hi = _mm_or_si128 (alpha_hi, mask_alpha); 1.335 + 1.336 + invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi); 1.337 + 1.338 + pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi); 1.339 + 1.340 + over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi); 1.341 +} 1.342 + 1.343 +static force_inline void 1.344 +in_over_2x128 (__m128i* src_lo, 1.345 + __m128i* src_hi, 1.346 + __m128i* alpha_lo, 1.347 + __m128i* alpha_hi, 1.348 + __m128i* mask_lo, 1.349 + __m128i* mask_hi, 1.350 + __m128i* dst_lo, 1.351 + __m128i* dst_hi) 1.352 +{ 1.353 + __m128i s_lo, s_hi; 1.354 + __m128i a_lo, a_hi; 1.355 + 1.356 + pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi); 1.357 + pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi); 1.358 + 1.359 + over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi); 1.360 +} 1.361 + 1.362 +/* load 4 pixels from a 16-byte boundary aligned address */ 1.363 +static force_inline __m128i 1.364 +load_128_aligned (__m128i* src) 1.365 +{ 1.366 + return _mm_load_si128 (src); 1.367 +} 1.368 + 1.369 +/* load 4 pixels from a unaligned address */ 1.370 +static force_inline __m128i 1.371 +load_128_unaligned (const __m128i* src) 1.372 +{ 1.373 + return _mm_loadu_si128 (src); 1.374 +} 1.375 + 1.376 +/* save 4 pixels using Write Combining memory on a 16-byte 1.377 + * boundary aligned address 1.378 + */ 1.379 +static force_inline void 1.380 +save_128_write_combining (__m128i* dst, 1.381 + __m128i data) 1.382 +{ 1.383 + _mm_stream_si128 (dst, data); 1.384 +} 1.385 + 1.386 +/* save 4 pixels on a 16-byte boundary aligned address */ 1.387 +static force_inline void 1.388 +save_128_aligned (__m128i* dst, 1.389 + __m128i data) 1.390 +{ 1.391 + _mm_store_si128 (dst, data); 1.392 +} 1.393 + 1.394 +/* save 4 pixels on a unaligned address */ 1.395 +static force_inline void 1.396 +save_128_unaligned (__m128i* dst, 1.397 + __m128i data) 1.398 +{ 1.399 + _mm_storeu_si128 (dst, data); 1.400 +} 1.401 + 1.402 +static force_inline __m128i 1.403 +load_32_1x128 (uint32_t data) 1.404 +{ 1.405 + return _mm_cvtsi32_si128 (data); 1.406 +} 1.407 + 1.408 +static force_inline __m128i 1.409 +expand_alpha_rev_1x128 (__m128i data) 1.410 +{ 1.411 + return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0)); 1.412 +} 1.413 + 1.414 +static force_inline __m128i 1.415 +expand_pixel_8_1x128 (uint8_t data) 1.416 +{ 1.417 + return _mm_shufflelo_epi16 ( 1.418 + unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0)); 1.419 +} 1.420 + 1.421 +static force_inline __m128i 1.422 +pix_multiply_1x128 (__m128i data, 1.423 + __m128i alpha) 1.424 +{ 1.425 + return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha), 1.426 + mask_0080), 1.427 + mask_0101); 1.428 +} 1.429 + 1.430 +static force_inline __m128i 1.431 +pix_add_multiply_1x128 (__m128i* src, 1.432 + __m128i* alpha_dst, 1.433 + __m128i* dst, 1.434 + __m128i* alpha_src) 1.435 +{ 1.436 + __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst); 1.437 + __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src); 1.438 + 1.439 + return _mm_adds_epu8 (t1, t2); 1.440 +} 1.441 + 1.442 +static force_inline __m128i 
1.443 +negate_1x128 (__m128i data) 1.444 +{ 1.445 + return _mm_xor_si128 (data, mask_00ff); 1.446 +} 1.447 + 1.448 +static force_inline __m128i 1.449 +invert_colors_1x128 (__m128i data) 1.450 +{ 1.451 + return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2)); 1.452 +} 1.453 + 1.454 +static force_inline __m128i 1.455 +over_1x128 (__m128i src, __m128i alpha, __m128i dst) 1.456 +{ 1.457 + return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha))); 1.458 +} 1.459 + 1.460 +static force_inline __m128i 1.461 +in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst) 1.462 +{ 1.463 + return over_1x128 (pix_multiply_1x128 (*src, *mask), 1.464 + pix_multiply_1x128 (*alpha, *mask), 1.465 + *dst); 1.466 +} 1.467 + 1.468 +static force_inline __m128i 1.469 +over_rev_non_pre_1x128 (__m128i src, __m128i dst) 1.470 +{ 1.471 + __m128i alpha = expand_alpha_1x128 (src); 1.472 + 1.473 + return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src), 1.474 + _mm_or_si128 (alpha, mask_alpha)), 1.475 + alpha, 1.476 + dst); 1.477 +} 1.478 + 1.479 +static force_inline uint32_t 1.480 +pack_1x128_32 (__m128i data) 1.481 +{ 1.482 + return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ())); 1.483 +} 1.484 + 1.485 +static force_inline __m128i 1.486 +expand565_16_1x128 (uint16_t pixel) 1.487 +{ 1.488 + __m128i m = _mm_cvtsi32_si128 (pixel); 1.489 + 1.490 + m = unpack_565_to_8888 (m); 1.491 + 1.492 + return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ()); 1.493 +} 1.494 + 1.495 +static force_inline uint32_t 1.496 +core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst) 1.497 +{ 1.498 + uint8_t a; 1.499 + __m128i xmms; 1.500 + 1.501 + a = src >> 24; 1.502 + 1.503 + if (a == 0xff) 1.504 + { 1.505 + return src; 1.506 + } 1.507 + else if (src) 1.508 + { 1.509 + xmms = unpack_32_1x128 (src); 1.510 + return pack_1x128_32 ( 1.511 + over_1x128 (xmms, expand_alpha_1x128 (xmms), 1.512 + unpack_32_1x128 (dst))); 1.513 + } 1.514 + 1.515 + return dst; 1.516 +} 1.517 + 1.518 +static force_inline uint32_t 1.519 +combine1 (const uint32_t *ps, const uint32_t *pm) 1.520 +{ 1.521 + uint32_t s = *ps; 1.522 + 1.523 + if (pm) 1.524 + { 1.525 + __m128i ms, mm; 1.526 + 1.527 + mm = unpack_32_1x128 (*pm); 1.528 + mm = expand_alpha_1x128 (mm); 1.529 + 1.530 + ms = unpack_32_1x128 (s); 1.531 + ms = pix_multiply_1x128 (ms, mm); 1.532 + 1.533 + s = pack_1x128_32 (ms); 1.534 + } 1.535 + 1.536 + return s; 1.537 +} 1.538 + 1.539 +static force_inline __m128i 1.540 +combine4 (const __m128i *ps, const __m128i *pm) 1.541 +{ 1.542 + __m128i xmm_src_lo, xmm_src_hi; 1.543 + __m128i xmm_msk_lo, xmm_msk_hi; 1.544 + __m128i s; 1.545 + 1.546 + if (pm) 1.547 + { 1.548 + xmm_msk_lo = load_128_unaligned (pm); 1.549 + 1.550 + if (is_transparent (xmm_msk_lo)) 1.551 + return _mm_setzero_si128 (); 1.552 + } 1.553 + 1.554 + s = load_128_unaligned (ps); 1.555 + 1.556 + if (pm) 1.557 + { 1.558 + unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi); 1.559 + unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi); 1.560 + 1.561 + expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi); 1.562 + 1.563 + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1.564 + &xmm_msk_lo, &xmm_msk_hi, 1.565 + &xmm_src_lo, &xmm_src_hi); 1.566 + 1.567 + s = pack_2x128_128 (xmm_src_lo, xmm_src_hi); 1.568 + } 1.569 + 1.570 + return s; 1.571 +} 1.572 + 1.573 +static force_inline void 1.574 +core_combine_over_u_sse2_mask (uint32_t * pd, 1.575 + const uint32_t* ps, 1.576 + const uint32_t* pm, 1.577 + int w) 1.578 +{ 1.579 + uint32_t s, d; 1.580 
+ 1.581 + /* Align dst on a 16-byte boundary */ 1.582 + while (w && ((uintptr_t)pd & 15)) 1.583 + { 1.584 + d = *pd; 1.585 + s = combine1 (ps, pm); 1.586 + 1.587 + if (s) 1.588 + *pd = core_combine_over_u_pixel_sse2 (s, d); 1.589 + pd++; 1.590 + ps++; 1.591 + pm++; 1.592 + w--; 1.593 + } 1.594 + 1.595 + while (w >= 4) 1.596 + { 1.597 + __m128i mask = load_128_unaligned ((__m128i *)pm); 1.598 + 1.599 + if (!is_zero (mask)) 1.600 + { 1.601 + __m128i src; 1.602 + __m128i src_hi, src_lo; 1.603 + __m128i mask_hi, mask_lo; 1.604 + __m128i alpha_hi, alpha_lo; 1.605 + 1.606 + src = load_128_unaligned ((__m128i *)ps); 1.607 + 1.608 + if (is_opaque (_mm_and_si128 (src, mask))) 1.609 + { 1.610 + save_128_aligned ((__m128i *)pd, src); 1.611 + } 1.612 + else 1.613 + { 1.614 + __m128i dst = load_128_aligned ((__m128i *)pd); 1.615 + __m128i dst_hi, dst_lo; 1.616 + 1.617 + unpack_128_2x128 (mask, &mask_lo, &mask_hi); 1.618 + unpack_128_2x128 (src, &src_lo, &src_hi); 1.619 + 1.620 + expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi); 1.621 + pix_multiply_2x128 (&src_lo, &src_hi, 1.622 + &mask_lo, &mask_hi, 1.623 + &src_lo, &src_hi); 1.624 + 1.625 + unpack_128_2x128 (dst, &dst_lo, &dst_hi); 1.626 + 1.627 + expand_alpha_2x128 (src_lo, src_hi, 1.628 + &alpha_lo, &alpha_hi); 1.629 + 1.630 + over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, 1.631 + &dst_lo, &dst_hi); 1.632 + 1.633 + save_128_aligned ( 1.634 + (__m128i *)pd, 1.635 + pack_2x128_128 (dst_lo, dst_hi)); 1.636 + } 1.637 + } 1.638 + 1.639 + pm += 4; 1.640 + ps += 4; 1.641 + pd += 4; 1.642 + w -= 4; 1.643 + } 1.644 + while (w) 1.645 + { 1.646 + d = *pd; 1.647 + s = combine1 (ps, pm); 1.648 + 1.649 + if (s) 1.650 + *pd = core_combine_over_u_pixel_sse2 (s, d); 1.651 + pd++; 1.652 + ps++; 1.653 + pm++; 1.654 + 1.655 + w--; 1.656 + } 1.657 +} 1.658 + 1.659 +static force_inline void 1.660 +core_combine_over_u_sse2_no_mask (uint32_t * pd, 1.661 + const uint32_t* ps, 1.662 + int w) 1.663 +{ 1.664 + uint32_t s, d; 1.665 + 1.666 + /* Align dst on a 16-byte boundary */ 1.667 + while (w && ((uintptr_t)pd & 15)) 1.668 + { 1.669 + d = *pd; 1.670 + s = *ps; 1.671 + 1.672 + if (s) 1.673 + *pd = core_combine_over_u_pixel_sse2 (s, d); 1.674 + pd++; 1.675 + ps++; 1.676 + w--; 1.677 + } 1.678 + 1.679 + while (w >= 4) 1.680 + { 1.681 + __m128i src; 1.682 + __m128i src_hi, src_lo, dst_hi, dst_lo; 1.683 + __m128i alpha_hi, alpha_lo; 1.684 + 1.685 + src = load_128_unaligned ((__m128i *)ps); 1.686 + 1.687 + if (!is_zero (src)) 1.688 + { 1.689 + if (is_opaque (src)) 1.690 + { 1.691 + save_128_aligned ((__m128i *)pd, src); 1.692 + } 1.693 + else 1.694 + { 1.695 + __m128i dst = load_128_aligned ((__m128i *)pd); 1.696 + 1.697 + unpack_128_2x128 (src, &src_lo, &src_hi); 1.698 + unpack_128_2x128 (dst, &dst_lo, &dst_hi); 1.699 + 1.700 + expand_alpha_2x128 (src_lo, src_hi, 1.701 + &alpha_lo, &alpha_hi); 1.702 + over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, 1.703 + &dst_lo, &dst_hi); 1.704 + 1.705 + save_128_aligned ( 1.706 + (__m128i *)pd, 1.707 + pack_2x128_128 (dst_lo, dst_hi)); 1.708 + } 1.709 + } 1.710 + 1.711 + ps += 4; 1.712 + pd += 4; 1.713 + w -= 4; 1.714 + } 1.715 + while (w) 1.716 + { 1.717 + d = *pd; 1.718 + s = *ps; 1.719 + 1.720 + if (s) 1.721 + *pd = core_combine_over_u_pixel_sse2 (s, d); 1.722 + pd++; 1.723 + ps++; 1.724 + 1.725 + w--; 1.726 + } 1.727 +} 1.728 + 1.729 +static force_inline void 1.730 +sse2_combine_over_u (pixman_implementation_t *imp, 1.731 + pixman_op_t op, 1.732 + uint32_t * pd, 1.733 + const uint32_t * ps, 1.734 + const uint32_t * pm, 
1.735 + int w) 1.736 +{ 1.737 + if (pm) 1.738 + core_combine_over_u_sse2_mask (pd, ps, pm, w); 1.739 + else 1.740 + core_combine_over_u_sse2_no_mask (pd, ps, w); 1.741 +} 1.742 + 1.743 +static void 1.744 +sse2_combine_over_reverse_u (pixman_implementation_t *imp, 1.745 + pixman_op_t op, 1.746 + uint32_t * pd, 1.747 + const uint32_t * ps, 1.748 + const uint32_t * pm, 1.749 + int w) 1.750 +{ 1.751 + uint32_t s, d; 1.752 + 1.753 + __m128i xmm_dst_lo, xmm_dst_hi; 1.754 + __m128i xmm_src_lo, xmm_src_hi; 1.755 + __m128i xmm_alpha_lo, xmm_alpha_hi; 1.756 + 1.757 + /* Align dst on a 16-byte boundary */ 1.758 + while (w && 1.759 + ((uintptr_t)pd & 15)) 1.760 + { 1.761 + d = *pd; 1.762 + s = combine1 (ps, pm); 1.763 + 1.764 + *pd++ = core_combine_over_u_pixel_sse2 (d, s); 1.765 + w--; 1.766 + ps++; 1.767 + if (pm) 1.768 + pm++; 1.769 + } 1.770 + 1.771 + while (w >= 4) 1.772 + { 1.773 + /* I'm loading unaligned because I'm not sure 1.774 + * about the address alignment. 1.775 + */ 1.776 + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); 1.777 + xmm_dst_hi = load_128_aligned ((__m128i*) pd); 1.778 + 1.779 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.780 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.781 + 1.782 + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1.783 + &xmm_alpha_lo, &xmm_alpha_hi); 1.784 + 1.785 + over_2x128 (&xmm_dst_lo, &xmm_dst_hi, 1.786 + &xmm_alpha_lo, &xmm_alpha_hi, 1.787 + &xmm_src_lo, &xmm_src_hi); 1.788 + 1.789 + /* rebuid the 4 pixel data and save*/ 1.790 + save_128_aligned ((__m128i*)pd, 1.791 + pack_2x128_128 (xmm_src_lo, xmm_src_hi)); 1.792 + 1.793 + w -= 4; 1.794 + ps += 4; 1.795 + pd += 4; 1.796 + 1.797 + if (pm) 1.798 + pm += 4; 1.799 + } 1.800 + 1.801 + while (w) 1.802 + { 1.803 + d = *pd; 1.804 + s = combine1 (ps, pm); 1.805 + 1.806 + *pd++ = core_combine_over_u_pixel_sse2 (d, s); 1.807 + ps++; 1.808 + w--; 1.809 + if (pm) 1.810 + pm++; 1.811 + } 1.812 +} 1.813 + 1.814 +static force_inline uint32_t 1.815 +core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst) 1.816 +{ 1.817 + uint32_t maska = src >> 24; 1.818 + 1.819 + if (maska == 0) 1.820 + { 1.821 + return 0; 1.822 + } 1.823 + else if (maska != 0xff) 1.824 + { 1.825 + return pack_1x128_32 ( 1.826 + pix_multiply_1x128 (unpack_32_1x128 (dst), 1.827 + expand_alpha_1x128 (unpack_32_1x128 (src)))); 1.828 + } 1.829 + 1.830 + return dst; 1.831 +} 1.832 + 1.833 +static void 1.834 +sse2_combine_in_u (pixman_implementation_t *imp, 1.835 + pixman_op_t op, 1.836 + uint32_t * pd, 1.837 + const uint32_t * ps, 1.838 + const uint32_t * pm, 1.839 + int w) 1.840 +{ 1.841 + uint32_t s, d; 1.842 + 1.843 + __m128i xmm_src_lo, xmm_src_hi; 1.844 + __m128i xmm_dst_lo, xmm_dst_hi; 1.845 + 1.846 + while (w && ((uintptr_t)pd & 15)) 1.847 + { 1.848 + s = combine1 (ps, pm); 1.849 + d = *pd; 1.850 + 1.851 + *pd++ = core_combine_in_u_pixel_sse2 (d, s); 1.852 + w--; 1.853 + ps++; 1.854 + if (pm) 1.855 + pm++; 1.856 + } 1.857 + 1.858 + while (w >= 4) 1.859 + { 1.860 + xmm_dst_hi = load_128_aligned ((__m128i*) pd); 1.861 + xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm); 1.862 + 1.863 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.864 + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.865 + 1.866 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.867 + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1.868 + &xmm_dst_lo, &xmm_dst_hi, 1.869 + &xmm_dst_lo, &xmm_dst_hi); 1.870 + 1.871 + save_128_aligned ((__m128i*)pd, 1.872 + pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 
1.873 + 1.874 + ps += 4; 1.875 + pd += 4; 1.876 + w -= 4; 1.877 + if (pm) 1.878 + pm += 4; 1.879 + } 1.880 + 1.881 + while (w) 1.882 + { 1.883 + s = combine1 (ps, pm); 1.884 + d = *pd; 1.885 + 1.886 + *pd++ = core_combine_in_u_pixel_sse2 (d, s); 1.887 + w--; 1.888 + ps++; 1.889 + if (pm) 1.890 + pm++; 1.891 + } 1.892 +} 1.893 + 1.894 +static void 1.895 +sse2_combine_in_reverse_u (pixman_implementation_t *imp, 1.896 + pixman_op_t op, 1.897 + uint32_t * pd, 1.898 + const uint32_t * ps, 1.899 + const uint32_t * pm, 1.900 + int w) 1.901 +{ 1.902 + uint32_t s, d; 1.903 + 1.904 + __m128i xmm_src_lo, xmm_src_hi; 1.905 + __m128i xmm_dst_lo, xmm_dst_hi; 1.906 + 1.907 + while (w && ((uintptr_t)pd & 15)) 1.908 + { 1.909 + s = combine1 (ps, pm); 1.910 + d = *pd; 1.911 + 1.912 + *pd++ = core_combine_in_u_pixel_sse2 (s, d); 1.913 + ps++; 1.914 + w--; 1.915 + if (pm) 1.916 + pm++; 1.917 + } 1.918 + 1.919 + while (w >= 4) 1.920 + { 1.921 + xmm_dst_hi = load_128_aligned ((__m128i*) pd); 1.922 + xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); 1.923 + 1.924 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.925 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.926 + 1.927 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.928 + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, 1.929 + &xmm_src_lo, &xmm_src_hi, 1.930 + &xmm_dst_lo, &xmm_dst_hi); 1.931 + 1.932 + save_128_aligned ( 1.933 + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.934 + 1.935 + ps += 4; 1.936 + pd += 4; 1.937 + w -= 4; 1.938 + if (pm) 1.939 + pm += 4; 1.940 + } 1.941 + 1.942 + while (w) 1.943 + { 1.944 + s = combine1 (ps, pm); 1.945 + d = *pd; 1.946 + 1.947 + *pd++ = core_combine_in_u_pixel_sse2 (s, d); 1.948 + w--; 1.949 + ps++; 1.950 + if (pm) 1.951 + pm++; 1.952 + } 1.953 +} 1.954 + 1.955 +static void 1.956 +sse2_combine_out_reverse_u (pixman_implementation_t *imp, 1.957 + pixman_op_t op, 1.958 + uint32_t * pd, 1.959 + const uint32_t * ps, 1.960 + const uint32_t * pm, 1.961 + int w) 1.962 +{ 1.963 + while (w && ((uintptr_t)pd & 15)) 1.964 + { 1.965 + uint32_t s = combine1 (ps, pm); 1.966 + uint32_t d = *pd; 1.967 + 1.968 + *pd++ = pack_1x128_32 ( 1.969 + pix_multiply_1x128 ( 1.970 + unpack_32_1x128 (d), negate_1x128 ( 1.971 + expand_alpha_1x128 (unpack_32_1x128 (s))))); 1.972 + 1.973 + if (pm) 1.974 + pm++; 1.975 + ps++; 1.976 + w--; 1.977 + } 1.978 + 1.979 + while (w >= 4) 1.980 + { 1.981 + __m128i xmm_src_lo, xmm_src_hi; 1.982 + __m128i xmm_dst_lo, xmm_dst_hi; 1.983 + 1.984 + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); 1.985 + xmm_dst_hi = load_128_aligned ((__m128i*) pd); 1.986 + 1.987 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.988 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.989 + 1.990 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.991 + negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.992 + 1.993 + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, 1.994 + &xmm_src_lo, &xmm_src_hi, 1.995 + &xmm_dst_lo, &xmm_dst_hi); 1.996 + 1.997 + save_128_aligned ( 1.998 + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.999 + 1.1000 + ps += 4; 1.1001 + pd += 4; 1.1002 + if (pm) 1.1003 + pm += 4; 1.1004 + 1.1005 + w -= 4; 1.1006 + } 1.1007 + 1.1008 + while (w) 1.1009 + { 1.1010 + uint32_t s = combine1 (ps, pm); 1.1011 + uint32_t d = *pd; 1.1012 + 1.1013 + *pd++ = pack_1x128_32 ( 1.1014 + pix_multiply_1x128 ( 1.1015 + unpack_32_1x128 (d), negate_1x128 ( 1.1016 + expand_alpha_1x128 (unpack_32_1x128 
(s))))); 1.1017 + ps++; 1.1018 + if (pm) 1.1019 + pm++; 1.1020 + w--; 1.1021 + } 1.1022 +} 1.1023 + 1.1024 +static void 1.1025 +sse2_combine_out_u (pixman_implementation_t *imp, 1.1026 + pixman_op_t op, 1.1027 + uint32_t * pd, 1.1028 + const uint32_t * ps, 1.1029 + const uint32_t * pm, 1.1030 + int w) 1.1031 +{ 1.1032 + while (w && ((uintptr_t)pd & 15)) 1.1033 + { 1.1034 + uint32_t s = combine1 (ps, pm); 1.1035 + uint32_t d = *pd; 1.1036 + 1.1037 + *pd++ = pack_1x128_32 ( 1.1038 + pix_multiply_1x128 ( 1.1039 + unpack_32_1x128 (s), negate_1x128 ( 1.1040 + expand_alpha_1x128 (unpack_32_1x128 (d))))); 1.1041 + w--; 1.1042 + ps++; 1.1043 + if (pm) 1.1044 + pm++; 1.1045 + } 1.1046 + 1.1047 + while (w >= 4) 1.1048 + { 1.1049 + __m128i xmm_src_lo, xmm_src_hi; 1.1050 + __m128i xmm_dst_lo, xmm_dst_hi; 1.1051 + 1.1052 + xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); 1.1053 + xmm_dst_hi = load_128_aligned ((__m128i*) pd); 1.1054 + 1.1055 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.1056 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.1057 + 1.1058 + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.1059 + negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.1060 + 1.1061 + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1.1062 + &xmm_dst_lo, &xmm_dst_hi, 1.1063 + &xmm_dst_lo, &xmm_dst_hi); 1.1064 + 1.1065 + save_128_aligned ( 1.1066 + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.1067 + 1.1068 + ps += 4; 1.1069 + pd += 4; 1.1070 + w -= 4; 1.1071 + if (pm) 1.1072 + pm += 4; 1.1073 + } 1.1074 + 1.1075 + while (w) 1.1076 + { 1.1077 + uint32_t s = combine1 (ps, pm); 1.1078 + uint32_t d = *pd; 1.1079 + 1.1080 + *pd++ = pack_1x128_32 ( 1.1081 + pix_multiply_1x128 ( 1.1082 + unpack_32_1x128 (s), negate_1x128 ( 1.1083 + expand_alpha_1x128 (unpack_32_1x128 (d))))); 1.1084 + w--; 1.1085 + ps++; 1.1086 + if (pm) 1.1087 + pm++; 1.1088 + } 1.1089 +} 1.1090 + 1.1091 +static force_inline uint32_t 1.1092 +core_combine_atop_u_pixel_sse2 (uint32_t src, 1.1093 + uint32_t dst) 1.1094 +{ 1.1095 + __m128i s = unpack_32_1x128 (src); 1.1096 + __m128i d = unpack_32_1x128 (dst); 1.1097 + 1.1098 + __m128i sa = negate_1x128 (expand_alpha_1x128 (s)); 1.1099 + __m128i da = expand_alpha_1x128 (d); 1.1100 + 1.1101 + return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); 1.1102 +} 1.1103 + 1.1104 +static void 1.1105 +sse2_combine_atop_u (pixman_implementation_t *imp, 1.1106 + pixman_op_t op, 1.1107 + uint32_t * pd, 1.1108 + const uint32_t * ps, 1.1109 + const uint32_t * pm, 1.1110 + int w) 1.1111 +{ 1.1112 + uint32_t s, d; 1.1113 + 1.1114 + __m128i xmm_src_lo, xmm_src_hi; 1.1115 + __m128i xmm_dst_lo, xmm_dst_hi; 1.1116 + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; 1.1117 + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; 1.1118 + 1.1119 + while (w && ((uintptr_t)pd & 15)) 1.1120 + { 1.1121 + s = combine1 (ps, pm); 1.1122 + d = *pd; 1.1123 + 1.1124 + *pd++ = core_combine_atop_u_pixel_sse2 (s, d); 1.1125 + w--; 1.1126 + ps++; 1.1127 + if (pm) 1.1128 + pm++; 1.1129 + } 1.1130 + 1.1131 + while (w >= 4) 1.1132 + { 1.1133 + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); 1.1134 + xmm_dst_hi = load_128_aligned ((__m128i*) pd); 1.1135 + 1.1136 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.1137 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.1138 + 1.1139 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1.1140 + &xmm_alpha_src_lo, &xmm_alpha_src_hi); 1.1141 + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1.1142 + 
&xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1.1143 + 1.1144 + negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, 1.1145 + &xmm_alpha_src_lo, &xmm_alpha_src_hi); 1.1146 + 1.1147 + pix_add_multiply_2x128 ( 1.1148 + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, 1.1149 + &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, 1.1150 + &xmm_dst_lo, &xmm_dst_hi); 1.1151 + 1.1152 + save_128_aligned ( 1.1153 + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.1154 + 1.1155 + ps += 4; 1.1156 + pd += 4; 1.1157 + w -= 4; 1.1158 + if (pm) 1.1159 + pm += 4; 1.1160 + } 1.1161 + 1.1162 + while (w) 1.1163 + { 1.1164 + s = combine1 (ps, pm); 1.1165 + d = *pd; 1.1166 + 1.1167 + *pd++ = core_combine_atop_u_pixel_sse2 (s, d); 1.1168 + w--; 1.1169 + ps++; 1.1170 + if (pm) 1.1171 + pm++; 1.1172 + } 1.1173 +} 1.1174 + 1.1175 +static force_inline uint32_t 1.1176 +core_combine_reverse_atop_u_pixel_sse2 (uint32_t src, 1.1177 + uint32_t dst) 1.1178 +{ 1.1179 + __m128i s = unpack_32_1x128 (src); 1.1180 + __m128i d = unpack_32_1x128 (dst); 1.1181 + 1.1182 + __m128i sa = expand_alpha_1x128 (s); 1.1183 + __m128i da = negate_1x128 (expand_alpha_1x128 (d)); 1.1184 + 1.1185 + return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); 1.1186 +} 1.1187 + 1.1188 +static void 1.1189 +sse2_combine_atop_reverse_u (pixman_implementation_t *imp, 1.1190 + pixman_op_t op, 1.1191 + uint32_t * pd, 1.1192 + const uint32_t * ps, 1.1193 + const uint32_t * pm, 1.1194 + int w) 1.1195 +{ 1.1196 + uint32_t s, d; 1.1197 + 1.1198 + __m128i xmm_src_lo, xmm_src_hi; 1.1199 + __m128i xmm_dst_lo, xmm_dst_hi; 1.1200 + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; 1.1201 + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; 1.1202 + 1.1203 + while (w && ((uintptr_t)pd & 15)) 1.1204 + { 1.1205 + s = combine1 (ps, pm); 1.1206 + d = *pd; 1.1207 + 1.1208 + *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); 1.1209 + ps++; 1.1210 + w--; 1.1211 + if (pm) 1.1212 + pm++; 1.1213 + } 1.1214 + 1.1215 + while (w >= 4) 1.1216 + { 1.1217 + xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); 1.1218 + xmm_dst_hi = load_128_aligned ((__m128i*) pd); 1.1219 + 1.1220 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.1221 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.1222 + 1.1223 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1.1224 + &xmm_alpha_src_lo, &xmm_alpha_src_hi); 1.1225 + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1.1226 + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1.1227 + 1.1228 + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, 1.1229 + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1.1230 + 1.1231 + pix_add_multiply_2x128 ( 1.1232 + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, 1.1233 + &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, 1.1234 + &xmm_dst_lo, &xmm_dst_hi); 1.1235 + 1.1236 + save_128_aligned ( 1.1237 + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.1238 + 1.1239 + ps += 4; 1.1240 + pd += 4; 1.1241 + w -= 4; 1.1242 + if (pm) 1.1243 + pm += 4; 1.1244 + } 1.1245 + 1.1246 + while (w) 1.1247 + { 1.1248 + s = combine1 (ps, pm); 1.1249 + d = *pd; 1.1250 + 1.1251 + *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); 1.1252 + ps++; 1.1253 + w--; 1.1254 + if (pm) 1.1255 + pm++; 1.1256 + } 1.1257 +} 1.1258 + 1.1259 +static force_inline uint32_t 1.1260 +core_combine_xor_u_pixel_sse2 (uint32_t src, 1.1261 + uint32_t dst) 1.1262 +{ 1.1263 + __m128i s = unpack_32_1x128 (src); 1.1264 + __m128i d = unpack_32_1x128 (dst); 1.1265 + 1.1266 + __m128i neg_d = negate_1x128 
(expand_alpha_1x128 (d)); 1.1267 + __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s)); 1.1268 + 1.1269 + return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s)); 1.1270 +} 1.1271 + 1.1272 +static void 1.1273 +sse2_combine_xor_u (pixman_implementation_t *imp, 1.1274 + pixman_op_t op, 1.1275 + uint32_t * dst, 1.1276 + const uint32_t * src, 1.1277 + const uint32_t * mask, 1.1278 + int width) 1.1279 +{ 1.1280 + int w = width; 1.1281 + uint32_t s, d; 1.1282 + uint32_t* pd = dst; 1.1283 + const uint32_t* ps = src; 1.1284 + const uint32_t* pm = mask; 1.1285 + 1.1286 + __m128i xmm_src, xmm_src_lo, xmm_src_hi; 1.1287 + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1.1288 + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; 1.1289 + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; 1.1290 + 1.1291 + while (w && ((uintptr_t)pd & 15)) 1.1292 + { 1.1293 + s = combine1 (ps, pm); 1.1294 + d = *pd; 1.1295 + 1.1296 + *pd++ = core_combine_xor_u_pixel_sse2 (s, d); 1.1297 + w--; 1.1298 + ps++; 1.1299 + if (pm) 1.1300 + pm++; 1.1301 + } 1.1302 + 1.1303 + while (w >= 4) 1.1304 + { 1.1305 + xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm); 1.1306 + xmm_dst = load_128_aligned ((__m128i*) pd); 1.1307 + 1.1308 + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 1.1309 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.1310 + 1.1311 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1.1312 + &xmm_alpha_src_lo, &xmm_alpha_src_hi); 1.1313 + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1.1314 + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1.1315 + 1.1316 + negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, 1.1317 + &xmm_alpha_src_lo, &xmm_alpha_src_hi); 1.1318 + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, 1.1319 + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1.1320 + 1.1321 + pix_add_multiply_2x128 ( 1.1322 + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, 1.1323 + &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, 1.1324 + &xmm_dst_lo, &xmm_dst_hi); 1.1325 + 1.1326 + save_128_aligned ( 1.1327 + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.1328 + 1.1329 + ps += 4; 1.1330 + pd += 4; 1.1331 + w -= 4; 1.1332 + if (pm) 1.1333 + pm += 4; 1.1334 + } 1.1335 + 1.1336 + while (w) 1.1337 + { 1.1338 + s = combine1 (ps, pm); 1.1339 + d = *pd; 1.1340 + 1.1341 + *pd++ = core_combine_xor_u_pixel_sse2 (s, d); 1.1342 + w--; 1.1343 + ps++; 1.1344 + if (pm) 1.1345 + pm++; 1.1346 + } 1.1347 +} 1.1348 + 1.1349 +static force_inline void 1.1350 +sse2_combine_add_u (pixman_implementation_t *imp, 1.1351 + pixman_op_t op, 1.1352 + uint32_t * dst, 1.1353 + const uint32_t * src, 1.1354 + const uint32_t * mask, 1.1355 + int width) 1.1356 +{ 1.1357 + int w = width; 1.1358 + uint32_t s, d; 1.1359 + uint32_t* pd = dst; 1.1360 + const uint32_t* ps = src; 1.1361 + const uint32_t* pm = mask; 1.1362 + 1.1363 + while (w && (uintptr_t)pd & 15) 1.1364 + { 1.1365 + s = combine1 (ps, pm); 1.1366 + d = *pd; 1.1367 + 1.1368 + ps++; 1.1369 + if (pm) 1.1370 + pm++; 1.1371 + *pd++ = _mm_cvtsi128_si32 ( 1.1372 + _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); 1.1373 + w--; 1.1374 + } 1.1375 + 1.1376 + while (w >= 4) 1.1377 + { 1.1378 + __m128i s; 1.1379 + 1.1380 + s = combine4 ((__m128i*)ps, (__m128i*)pm); 1.1381 + 1.1382 + save_128_aligned ( 1.1383 + (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd))); 1.1384 + 1.1385 + pd += 4; 1.1386 + ps += 4; 1.1387 + if (pm) 1.1388 + pm += 4; 1.1389 + w -= 4; 1.1390 + } 1.1391 + 1.1392 + while (w--) 1.1393 + { 1.1394 + s = combine1 (ps, pm); 1.1395 + d = 
*pd; 1.1396 + 1.1397 + ps++; 1.1398 + *pd++ = _mm_cvtsi128_si32 ( 1.1399 + _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); 1.1400 + if (pm) 1.1401 + pm++; 1.1402 + } 1.1403 +} 1.1404 + 1.1405 +static force_inline uint32_t 1.1406 +core_combine_saturate_u_pixel_sse2 (uint32_t src, 1.1407 + uint32_t dst) 1.1408 +{ 1.1409 + __m128i ms = unpack_32_1x128 (src); 1.1410 + __m128i md = unpack_32_1x128 (dst); 1.1411 + uint32_t sa = src >> 24; 1.1412 + uint32_t da = ~dst >> 24; 1.1413 + 1.1414 + if (sa > da) 1.1415 + { 1.1416 + ms = pix_multiply_1x128 ( 1.1417 + ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24))); 1.1418 + } 1.1419 + 1.1420 + return pack_1x128_32 (_mm_adds_epu16 (md, ms)); 1.1421 +} 1.1422 + 1.1423 +static void 1.1424 +sse2_combine_saturate_u (pixman_implementation_t *imp, 1.1425 + pixman_op_t op, 1.1426 + uint32_t * pd, 1.1427 + const uint32_t * ps, 1.1428 + const uint32_t * pm, 1.1429 + int w) 1.1430 +{ 1.1431 + uint32_t s, d; 1.1432 + 1.1433 + uint32_t pack_cmp; 1.1434 + __m128i xmm_src, xmm_dst; 1.1435 + 1.1436 + while (w && (uintptr_t)pd & 15) 1.1437 + { 1.1438 + s = combine1 (ps, pm); 1.1439 + d = *pd; 1.1440 + 1.1441 + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); 1.1442 + w--; 1.1443 + ps++; 1.1444 + if (pm) 1.1445 + pm++; 1.1446 + } 1.1447 + 1.1448 + while (w >= 4) 1.1449 + { 1.1450 + xmm_dst = load_128_aligned ((__m128i*)pd); 1.1451 + xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm); 1.1452 + 1.1453 + pack_cmp = _mm_movemask_epi8 ( 1.1454 + _mm_cmpgt_epi32 ( 1.1455 + _mm_srli_epi32 (xmm_src, 24), 1.1456 + _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24))); 1.1457 + 1.1458 + /* if some alpha src is grater than respective ~alpha dst */ 1.1459 + if (pack_cmp) 1.1460 + { 1.1461 + s = combine1 (ps++, pm); 1.1462 + d = *pd; 1.1463 + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); 1.1464 + if (pm) 1.1465 + pm++; 1.1466 + 1.1467 + s = combine1 (ps++, pm); 1.1468 + d = *pd; 1.1469 + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); 1.1470 + if (pm) 1.1471 + pm++; 1.1472 + 1.1473 + s = combine1 (ps++, pm); 1.1474 + d = *pd; 1.1475 + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); 1.1476 + if (pm) 1.1477 + pm++; 1.1478 + 1.1479 + s = combine1 (ps++, pm); 1.1480 + d = *pd; 1.1481 + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); 1.1482 + if (pm) 1.1483 + pm++; 1.1484 + } 1.1485 + else 1.1486 + { 1.1487 + save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src)); 1.1488 + 1.1489 + pd += 4; 1.1490 + ps += 4; 1.1491 + if (pm) 1.1492 + pm += 4; 1.1493 + } 1.1494 + 1.1495 + w -= 4; 1.1496 + } 1.1497 + 1.1498 + while (w--) 1.1499 + { 1.1500 + s = combine1 (ps, pm); 1.1501 + d = *pd; 1.1502 + 1.1503 + *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); 1.1504 + ps++; 1.1505 + if (pm) 1.1506 + pm++; 1.1507 + } 1.1508 +} 1.1509 + 1.1510 +static void 1.1511 +sse2_combine_src_ca (pixman_implementation_t *imp, 1.1512 + pixman_op_t op, 1.1513 + uint32_t * pd, 1.1514 + const uint32_t * ps, 1.1515 + const uint32_t * pm, 1.1516 + int w) 1.1517 +{ 1.1518 + uint32_t s, m; 1.1519 + 1.1520 + __m128i xmm_src_lo, xmm_src_hi; 1.1521 + __m128i xmm_mask_lo, xmm_mask_hi; 1.1522 + __m128i xmm_dst_lo, xmm_dst_hi; 1.1523 + 1.1524 + while (w && (uintptr_t)pd & 15) 1.1525 + { 1.1526 + s = *ps++; 1.1527 + m = *pm++; 1.1528 + *pd++ = pack_1x128_32 ( 1.1529 + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); 1.1530 + w--; 1.1531 + } 1.1532 + 1.1533 + while (w >= 4) 1.1534 + { 1.1535 + xmm_src_hi = load_128_unaligned ((__m128i*)ps); 1.1536 + 
xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1.1537 + 1.1538 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.1539 + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1.1540 + 1.1541 + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1.1542 + &xmm_mask_lo, &xmm_mask_hi, 1.1543 + &xmm_dst_lo, &xmm_dst_hi); 1.1544 + 1.1545 + save_128_aligned ( 1.1546 + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.1547 + 1.1548 + ps += 4; 1.1549 + pd += 4; 1.1550 + pm += 4; 1.1551 + w -= 4; 1.1552 + } 1.1553 + 1.1554 + while (w) 1.1555 + { 1.1556 + s = *ps++; 1.1557 + m = *pm++; 1.1558 + *pd++ = pack_1x128_32 ( 1.1559 + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); 1.1560 + w--; 1.1561 + } 1.1562 +} 1.1563 + 1.1564 +static force_inline uint32_t 1.1565 +core_combine_over_ca_pixel_sse2 (uint32_t src, 1.1566 + uint32_t mask, 1.1567 + uint32_t dst) 1.1568 +{ 1.1569 + __m128i s = unpack_32_1x128 (src); 1.1570 + __m128i expAlpha = expand_alpha_1x128 (s); 1.1571 + __m128i unpk_mask = unpack_32_1x128 (mask); 1.1572 + __m128i unpk_dst = unpack_32_1x128 (dst); 1.1573 + 1.1574 + return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst)); 1.1575 +} 1.1576 + 1.1577 +static void 1.1578 +sse2_combine_over_ca (pixman_implementation_t *imp, 1.1579 + pixman_op_t op, 1.1580 + uint32_t * pd, 1.1581 + const uint32_t * ps, 1.1582 + const uint32_t * pm, 1.1583 + int w) 1.1584 +{ 1.1585 + uint32_t s, m, d; 1.1586 + 1.1587 + __m128i xmm_alpha_lo, xmm_alpha_hi; 1.1588 + __m128i xmm_src_lo, xmm_src_hi; 1.1589 + __m128i xmm_dst_lo, xmm_dst_hi; 1.1590 + __m128i xmm_mask_lo, xmm_mask_hi; 1.1591 + 1.1592 + while (w && (uintptr_t)pd & 15) 1.1593 + { 1.1594 + s = *ps++; 1.1595 + m = *pm++; 1.1596 + d = *pd; 1.1597 + 1.1598 + *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); 1.1599 + w--; 1.1600 + } 1.1601 + 1.1602 + while (w >= 4) 1.1603 + { 1.1604 + xmm_dst_hi = load_128_aligned ((__m128i*)pd); 1.1605 + xmm_src_hi = load_128_unaligned ((__m128i*)ps); 1.1606 + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1.1607 + 1.1608 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.1609 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.1610 + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1.1611 + 1.1612 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1.1613 + &xmm_alpha_lo, &xmm_alpha_hi); 1.1614 + 1.1615 + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, 1.1616 + &xmm_alpha_lo, &xmm_alpha_hi, 1.1617 + &xmm_mask_lo, &xmm_mask_hi, 1.1618 + &xmm_dst_lo, &xmm_dst_hi); 1.1619 + 1.1620 + save_128_aligned ( 1.1621 + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.1622 + 1.1623 + ps += 4; 1.1624 + pd += 4; 1.1625 + pm += 4; 1.1626 + w -= 4; 1.1627 + } 1.1628 + 1.1629 + while (w) 1.1630 + { 1.1631 + s = *ps++; 1.1632 + m = *pm++; 1.1633 + d = *pd; 1.1634 + 1.1635 + *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); 1.1636 + w--; 1.1637 + } 1.1638 +} 1.1639 + 1.1640 +static force_inline uint32_t 1.1641 +core_combine_over_reverse_ca_pixel_sse2 (uint32_t src, 1.1642 + uint32_t mask, 1.1643 + uint32_t dst) 1.1644 +{ 1.1645 + __m128i d = unpack_32_1x128 (dst); 1.1646 + 1.1647 + return pack_1x128_32 ( 1.1648 + over_1x128 (d, expand_alpha_1x128 (d), 1.1649 + pix_multiply_1x128 (unpack_32_1x128 (src), 1.1650 + unpack_32_1x128 (mask)))); 1.1651 +} 1.1652 + 1.1653 +static void 1.1654 +sse2_combine_over_reverse_ca (pixman_implementation_t *imp, 1.1655 + pixman_op_t op, 1.1656 + uint32_t * pd, 1.1657 + const uint32_t * ps, 1.1658 + const uint32_t * pm, 1.1659 + 
int w) 1.1660 +{ 1.1661 + uint32_t s, m, d; 1.1662 + 1.1663 + __m128i xmm_alpha_lo, xmm_alpha_hi; 1.1664 + __m128i xmm_src_lo, xmm_src_hi; 1.1665 + __m128i xmm_dst_lo, xmm_dst_hi; 1.1666 + __m128i xmm_mask_lo, xmm_mask_hi; 1.1667 + 1.1668 + while (w && (uintptr_t)pd & 15) 1.1669 + { 1.1670 + s = *ps++; 1.1671 + m = *pm++; 1.1672 + d = *pd; 1.1673 + 1.1674 + *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); 1.1675 + w--; 1.1676 + } 1.1677 + 1.1678 + while (w >= 4) 1.1679 + { 1.1680 + xmm_dst_hi = load_128_aligned ((__m128i*)pd); 1.1681 + xmm_src_hi = load_128_unaligned ((__m128i*)ps); 1.1682 + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1.1683 + 1.1684 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.1685 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.1686 + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1.1687 + 1.1688 + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1.1689 + &xmm_alpha_lo, &xmm_alpha_hi); 1.1690 + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1.1691 + &xmm_mask_lo, &xmm_mask_hi, 1.1692 + &xmm_mask_lo, &xmm_mask_hi); 1.1693 + 1.1694 + over_2x128 (&xmm_dst_lo, &xmm_dst_hi, 1.1695 + &xmm_alpha_lo, &xmm_alpha_hi, 1.1696 + &xmm_mask_lo, &xmm_mask_hi); 1.1697 + 1.1698 + save_128_aligned ( 1.1699 + (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); 1.1700 + 1.1701 + ps += 4; 1.1702 + pd += 4; 1.1703 + pm += 4; 1.1704 + w -= 4; 1.1705 + } 1.1706 + 1.1707 + while (w) 1.1708 + { 1.1709 + s = *ps++; 1.1710 + m = *pm++; 1.1711 + d = *pd; 1.1712 + 1.1713 + *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); 1.1714 + w--; 1.1715 + } 1.1716 +} 1.1717 + 1.1718 +static void 1.1719 +sse2_combine_in_ca (pixman_implementation_t *imp, 1.1720 + pixman_op_t op, 1.1721 + uint32_t * pd, 1.1722 + const uint32_t * ps, 1.1723 + const uint32_t * pm, 1.1724 + int w) 1.1725 +{ 1.1726 + uint32_t s, m, d; 1.1727 + 1.1728 + __m128i xmm_alpha_lo, xmm_alpha_hi; 1.1729 + __m128i xmm_src_lo, xmm_src_hi; 1.1730 + __m128i xmm_dst_lo, xmm_dst_hi; 1.1731 + __m128i xmm_mask_lo, xmm_mask_hi; 1.1732 + 1.1733 + while (w && (uintptr_t)pd & 15) 1.1734 + { 1.1735 + s = *ps++; 1.1736 + m = *pm++; 1.1737 + d = *pd; 1.1738 + 1.1739 + *pd++ = pack_1x128_32 ( 1.1740 + pix_multiply_1x128 ( 1.1741 + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)), 1.1742 + expand_alpha_1x128 (unpack_32_1x128 (d)))); 1.1743 + 1.1744 + w--; 1.1745 + } 1.1746 + 1.1747 + while (w >= 4) 1.1748 + { 1.1749 + xmm_dst_hi = load_128_aligned ((__m128i*)pd); 1.1750 + xmm_src_hi = load_128_unaligned ((__m128i*)ps); 1.1751 + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1.1752 + 1.1753 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.1754 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.1755 + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1.1756 + 1.1757 + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1.1758 + &xmm_alpha_lo, &xmm_alpha_hi); 1.1759 + 1.1760 + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1.1761 + &xmm_mask_lo, &xmm_mask_hi, 1.1762 + &xmm_dst_lo, &xmm_dst_hi); 1.1763 + 1.1764 + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, 1.1765 + &xmm_alpha_lo, &xmm_alpha_hi, 1.1766 + &xmm_dst_lo, &xmm_dst_hi); 1.1767 + 1.1768 + save_128_aligned ( 1.1769 + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.1770 + 1.1771 + ps += 4; 1.1772 + pd += 4; 1.1773 + pm += 4; 1.1774 + w -= 4; 1.1775 + } 1.1776 + 1.1777 + while (w) 1.1778 + { 1.1779 + s = *ps++; 1.1780 + m = *pm++; 1.1781 + d = *pd; 1.1782 + 1.1783 + *pd++ = pack_1x128_32 ( 
1.1784 + pix_multiply_1x128 ( 1.1785 + pix_multiply_1x128 ( 1.1786 + unpack_32_1x128 (s), unpack_32_1x128 (m)), 1.1787 + expand_alpha_1x128 (unpack_32_1x128 (d)))); 1.1788 + 1.1789 + w--; 1.1790 + } 1.1791 +} 1.1792 + 1.1793 +static void 1.1794 +sse2_combine_in_reverse_ca (pixman_implementation_t *imp, 1.1795 + pixman_op_t op, 1.1796 + uint32_t * pd, 1.1797 + const uint32_t * ps, 1.1798 + const uint32_t * pm, 1.1799 + int w) 1.1800 +{ 1.1801 + uint32_t s, m, d; 1.1802 + 1.1803 + __m128i xmm_alpha_lo, xmm_alpha_hi; 1.1804 + __m128i xmm_src_lo, xmm_src_hi; 1.1805 + __m128i xmm_dst_lo, xmm_dst_hi; 1.1806 + __m128i xmm_mask_lo, xmm_mask_hi; 1.1807 + 1.1808 + while (w && (uintptr_t)pd & 15) 1.1809 + { 1.1810 + s = *ps++; 1.1811 + m = *pm++; 1.1812 + d = *pd; 1.1813 + 1.1814 + *pd++ = pack_1x128_32 ( 1.1815 + pix_multiply_1x128 ( 1.1816 + unpack_32_1x128 (d), 1.1817 + pix_multiply_1x128 (unpack_32_1x128 (m), 1.1818 + expand_alpha_1x128 (unpack_32_1x128 (s))))); 1.1819 + w--; 1.1820 + } 1.1821 + 1.1822 + while (w >= 4) 1.1823 + { 1.1824 + xmm_dst_hi = load_128_aligned ((__m128i*)pd); 1.1825 + xmm_src_hi = load_128_unaligned ((__m128i*)ps); 1.1826 + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1.1827 + 1.1828 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.1829 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.1830 + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1.1831 + 1.1832 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1.1833 + &xmm_alpha_lo, &xmm_alpha_hi); 1.1834 + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, 1.1835 + &xmm_alpha_lo, &xmm_alpha_hi, 1.1836 + &xmm_alpha_lo, &xmm_alpha_hi); 1.1837 + 1.1838 + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, 1.1839 + &xmm_alpha_lo, &xmm_alpha_hi, 1.1840 + &xmm_dst_lo, &xmm_dst_hi); 1.1841 + 1.1842 + save_128_aligned ( 1.1843 + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.1844 + 1.1845 + ps += 4; 1.1846 + pd += 4; 1.1847 + pm += 4; 1.1848 + w -= 4; 1.1849 + } 1.1850 + 1.1851 + while (w) 1.1852 + { 1.1853 + s = *ps++; 1.1854 + m = *pm++; 1.1855 + d = *pd; 1.1856 + 1.1857 + *pd++ = pack_1x128_32 ( 1.1858 + pix_multiply_1x128 ( 1.1859 + unpack_32_1x128 (d), 1.1860 + pix_multiply_1x128 (unpack_32_1x128 (m), 1.1861 + expand_alpha_1x128 (unpack_32_1x128 (s))))); 1.1862 + w--; 1.1863 + } 1.1864 +} 1.1865 + 1.1866 +static void 1.1867 +sse2_combine_out_ca (pixman_implementation_t *imp, 1.1868 + pixman_op_t op, 1.1869 + uint32_t * pd, 1.1870 + const uint32_t * ps, 1.1871 + const uint32_t * pm, 1.1872 + int w) 1.1873 +{ 1.1874 + uint32_t s, m, d; 1.1875 + 1.1876 + __m128i xmm_alpha_lo, xmm_alpha_hi; 1.1877 + __m128i xmm_src_lo, xmm_src_hi; 1.1878 + __m128i xmm_dst_lo, xmm_dst_hi; 1.1879 + __m128i xmm_mask_lo, xmm_mask_hi; 1.1880 + 1.1881 + while (w && (uintptr_t)pd & 15) 1.1882 + { 1.1883 + s = *ps++; 1.1884 + m = *pm++; 1.1885 + d = *pd; 1.1886 + 1.1887 + *pd++ = pack_1x128_32 ( 1.1888 + pix_multiply_1x128 ( 1.1889 + pix_multiply_1x128 ( 1.1890 + unpack_32_1x128 (s), unpack_32_1x128 (m)), 1.1891 + negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); 1.1892 + w--; 1.1893 + } 1.1894 + 1.1895 + while (w >= 4) 1.1896 + { 1.1897 + xmm_dst_hi = load_128_aligned ((__m128i*)pd); 1.1898 + xmm_src_hi = load_128_unaligned ((__m128i*)ps); 1.1899 + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1.1900 + 1.1901 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.1902 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.1903 + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1.1904 + 
1.1905 + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1.1906 + &xmm_alpha_lo, &xmm_alpha_hi); 1.1907 + negate_2x128 (xmm_alpha_lo, xmm_alpha_hi, 1.1908 + &xmm_alpha_lo, &xmm_alpha_hi); 1.1909 + 1.1910 + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1.1911 + &xmm_mask_lo, &xmm_mask_hi, 1.1912 + &xmm_dst_lo, &xmm_dst_hi); 1.1913 + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, 1.1914 + &xmm_alpha_lo, &xmm_alpha_hi, 1.1915 + &xmm_dst_lo, &xmm_dst_hi); 1.1916 + 1.1917 + save_128_aligned ( 1.1918 + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.1919 + 1.1920 + ps += 4; 1.1921 + pd += 4; 1.1922 + pm += 4; 1.1923 + w -= 4; 1.1924 + } 1.1925 + 1.1926 + while (w) 1.1927 + { 1.1928 + s = *ps++; 1.1929 + m = *pm++; 1.1930 + d = *pd; 1.1931 + 1.1932 + *pd++ = pack_1x128_32 ( 1.1933 + pix_multiply_1x128 ( 1.1934 + pix_multiply_1x128 ( 1.1935 + unpack_32_1x128 (s), unpack_32_1x128 (m)), 1.1936 + negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); 1.1937 + 1.1938 + w--; 1.1939 + } 1.1940 +} 1.1941 + 1.1942 +static void 1.1943 +sse2_combine_out_reverse_ca (pixman_implementation_t *imp, 1.1944 + pixman_op_t op, 1.1945 + uint32_t * pd, 1.1946 + const uint32_t * ps, 1.1947 + const uint32_t * pm, 1.1948 + int w) 1.1949 +{ 1.1950 + uint32_t s, m, d; 1.1951 + 1.1952 + __m128i xmm_alpha_lo, xmm_alpha_hi; 1.1953 + __m128i xmm_src_lo, xmm_src_hi; 1.1954 + __m128i xmm_dst_lo, xmm_dst_hi; 1.1955 + __m128i xmm_mask_lo, xmm_mask_hi; 1.1956 + 1.1957 + while (w && (uintptr_t)pd & 15) 1.1958 + { 1.1959 + s = *ps++; 1.1960 + m = *pm++; 1.1961 + d = *pd; 1.1962 + 1.1963 + *pd++ = pack_1x128_32 ( 1.1964 + pix_multiply_1x128 ( 1.1965 + unpack_32_1x128 (d), 1.1966 + negate_1x128 (pix_multiply_1x128 ( 1.1967 + unpack_32_1x128 (m), 1.1968 + expand_alpha_1x128 (unpack_32_1x128 (s)))))); 1.1969 + w--; 1.1970 + } 1.1971 + 1.1972 + while (w >= 4) 1.1973 + { 1.1974 + xmm_dst_hi = load_128_aligned ((__m128i*)pd); 1.1975 + xmm_src_hi = load_128_unaligned ((__m128i*)ps); 1.1976 + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1.1977 + 1.1978 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.1979 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.1980 + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1.1981 + 1.1982 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1.1983 + &xmm_alpha_lo, &xmm_alpha_hi); 1.1984 + 1.1985 + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, 1.1986 + &xmm_alpha_lo, &xmm_alpha_hi, 1.1987 + &xmm_mask_lo, &xmm_mask_hi); 1.1988 + 1.1989 + negate_2x128 (xmm_mask_lo, xmm_mask_hi, 1.1990 + &xmm_mask_lo, &xmm_mask_hi); 1.1991 + 1.1992 + pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, 1.1993 + &xmm_mask_lo, &xmm_mask_hi, 1.1994 + &xmm_dst_lo, &xmm_dst_hi); 1.1995 + 1.1996 + save_128_aligned ( 1.1997 + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.1998 + 1.1999 + ps += 4; 1.2000 + pd += 4; 1.2001 + pm += 4; 1.2002 + w -= 4; 1.2003 + } 1.2004 + 1.2005 + while (w) 1.2006 + { 1.2007 + s = *ps++; 1.2008 + m = *pm++; 1.2009 + d = *pd; 1.2010 + 1.2011 + *pd++ = pack_1x128_32 ( 1.2012 + pix_multiply_1x128 ( 1.2013 + unpack_32_1x128 (d), 1.2014 + negate_1x128 (pix_multiply_1x128 ( 1.2015 + unpack_32_1x128 (m), 1.2016 + expand_alpha_1x128 (unpack_32_1x128 (s)))))); 1.2017 + w--; 1.2018 + } 1.2019 +} 1.2020 + 1.2021 +static force_inline uint32_t 1.2022 +core_combine_atop_ca_pixel_sse2 (uint32_t src, 1.2023 + uint32_t mask, 1.2024 + uint32_t dst) 1.2025 +{ 1.2026 + __m128i m = unpack_32_1x128 (mask); 1.2027 + __m128i s = unpack_32_1x128 (src); 1.2028 + __m128i d = 
unpack_32_1x128 (dst); 1.2029 + __m128i sa = expand_alpha_1x128 (s); 1.2030 + __m128i da = expand_alpha_1x128 (d); 1.2031 + 1.2032 + s = pix_multiply_1x128 (s, m); 1.2033 + m = negate_1x128 (pix_multiply_1x128 (m, sa)); 1.2034 + 1.2035 + return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); 1.2036 +} 1.2037 + 1.2038 +static void 1.2039 +sse2_combine_atop_ca (pixman_implementation_t *imp, 1.2040 + pixman_op_t op, 1.2041 + uint32_t * pd, 1.2042 + const uint32_t * ps, 1.2043 + const uint32_t * pm, 1.2044 + int w) 1.2045 +{ 1.2046 + uint32_t s, m, d; 1.2047 + 1.2048 + __m128i xmm_src_lo, xmm_src_hi; 1.2049 + __m128i xmm_dst_lo, xmm_dst_hi; 1.2050 + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; 1.2051 + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; 1.2052 + __m128i xmm_mask_lo, xmm_mask_hi; 1.2053 + 1.2054 + while (w && (uintptr_t)pd & 15) 1.2055 + { 1.2056 + s = *ps++; 1.2057 + m = *pm++; 1.2058 + d = *pd; 1.2059 + 1.2060 + *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); 1.2061 + w--; 1.2062 + } 1.2063 + 1.2064 + while (w >= 4) 1.2065 + { 1.2066 + xmm_dst_hi = load_128_aligned ((__m128i*)pd); 1.2067 + xmm_src_hi = load_128_unaligned ((__m128i*)ps); 1.2068 + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1.2069 + 1.2070 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.2071 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.2072 + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1.2073 + 1.2074 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1.2075 + &xmm_alpha_src_lo, &xmm_alpha_src_hi); 1.2076 + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1.2077 + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1.2078 + 1.2079 + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1.2080 + &xmm_mask_lo, &xmm_mask_hi, 1.2081 + &xmm_src_lo, &xmm_src_hi); 1.2082 + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, 1.2083 + &xmm_alpha_src_lo, &xmm_alpha_src_hi, 1.2084 + &xmm_mask_lo, &xmm_mask_hi); 1.2085 + 1.2086 + negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1.2087 + 1.2088 + pix_add_multiply_2x128 ( 1.2089 + &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, 1.2090 + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, 1.2091 + &xmm_dst_lo, &xmm_dst_hi); 1.2092 + 1.2093 + save_128_aligned ( 1.2094 + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.2095 + 1.2096 + ps += 4; 1.2097 + pd += 4; 1.2098 + pm += 4; 1.2099 + w -= 4; 1.2100 + } 1.2101 + 1.2102 + while (w) 1.2103 + { 1.2104 + s = *ps++; 1.2105 + m = *pm++; 1.2106 + d = *pd; 1.2107 + 1.2108 + *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); 1.2109 + w--; 1.2110 + } 1.2111 +} 1.2112 + 1.2113 +static force_inline uint32_t 1.2114 +core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src, 1.2115 + uint32_t mask, 1.2116 + uint32_t dst) 1.2117 +{ 1.2118 + __m128i m = unpack_32_1x128 (mask); 1.2119 + __m128i s = unpack_32_1x128 (src); 1.2120 + __m128i d = unpack_32_1x128 (dst); 1.2121 + 1.2122 + __m128i da = negate_1x128 (expand_alpha_1x128 (d)); 1.2123 + __m128i sa = expand_alpha_1x128 (s); 1.2124 + 1.2125 + s = pix_multiply_1x128 (s, m); 1.2126 + m = pix_multiply_1x128 (m, sa); 1.2127 + 1.2128 + return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); 1.2129 +} 1.2130 + 1.2131 +static void 1.2132 +sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, 1.2133 + pixman_op_t op, 1.2134 + uint32_t * pd, 1.2135 + const uint32_t * ps, 1.2136 + const uint32_t * pm, 1.2137 + int w) 1.2138 +{ 1.2139 + uint32_t s, m, d; 1.2140 + 1.2141 + __m128i xmm_src_lo, xmm_src_hi; 1.2142 + __m128i 
xmm_dst_lo, xmm_dst_hi; 1.2143 + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; 1.2144 + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; 1.2145 + __m128i xmm_mask_lo, xmm_mask_hi; 1.2146 + 1.2147 + while (w && (uintptr_t)pd & 15) 1.2148 + { 1.2149 + s = *ps++; 1.2150 + m = *pm++; 1.2151 + d = *pd; 1.2152 + 1.2153 + *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); 1.2154 + w--; 1.2155 + } 1.2156 + 1.2157 + while (w >= 4) 1.2158 + { 1.2159 + xmm_dst_hi = load_128_aligned ((__m128i*)pd); 1.2160 + xmm_src_hi = load_128_unaligned ((__m128i*)ps); 1.2161 + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1.2162 + 1.2163 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.2164 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.2165 + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1.2166 + 1.2167 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1.2168 + &xmm_alpha_src_lo, &xmm_alpha_src_hi); 1.2169 + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1.2170 + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1.2171 + 1.2172 + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1.2173 + &xmm_mask_lo, &xmm_mask_hi, 1.2174 + &xmm_src_lo, &xmm_src_hi); 1.2175 + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, 1.2176 + &xmm_alpha_src_lo, &xmm_alpha_src_hi, 1.2177 + &xmm_mask_lo, &xmm_mask_hi); 1.2178 + 1.2179 + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, 1.2180 + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1.2181 + 1.2182 + pix_add_multiply_2x128 ( 1.2183 + &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, 1.2184 + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, 1.2185 + &xmm_dst_lo, &xmm_dst_hi); 1.2186 + 1.2187 + save_128_aligned ( 1.2188 + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.2189 + 1.2190 + ps += 4; 1.2191 + pd += 4; 1.2192 + pm += 4; 1.2193 + w -= 4; 1.2194 + } 1.2195 + 1.2196 + while (w) 1.2197 + { 1.2198 + s = *ps++; 1.2199 + m = *pm++; 1.2200 + d = *pd; 1.2201 + 1.2202 + *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); 1.2203 + w--; 1.2204 + } 1.2205 +} 1.2206 + 1.2207 +static force_inline uint32_t 1.2208 +core_combine_xor_ca_pixel_sse2 (uint32_t src, 1.2209 + uint32_t mask, 1.2210 + uint32_t dst) 1.2211 +{ 1.2212 + __m128i a = unpack_32_1x128 (mask); 1.2213 + __m128i s = unpack_32_1x128 (src); 1.2214 + __m128i d = unpack_32_1x128 (dst); 1.2215 + 1.2216 + __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 ( 1.2217 + a, expand_alpha_1x128 (s))); 1.2218 + __m128i dest = pix_multiply_1x128 (s, a); 1.2219 + __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d)); 1.2220 + 1.2221 + return pack_1x128_32 (pix_add_multiply_1x128 (&d, 1.2222 + &alpha_dst, 1.2223 + &dest, 1.2224 + &alpha_src)); 1.2225 +} 1.2226 + 1.2227 +static void 1.2228 +sse2_combine_xor_ca (pixman_implementation_t *imp, 1.2229 + pixman_op_t op, 1.2230 + uint32_t * pd, 1.2231 + const uint32_t * ps, 1.2232 + const uint32_t * pm, 1.2233 + int w) 1.2234 +{ 1.2235 + uint32_t s, m, d; 1.2236 + 1.2237 + __m128i xmm_src_lo, xmm_src_hi; 1.2238 + __m128i xmm_dst_lo, xmm_dst_hi; 1.2239 + __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; 1.2240 + __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; 1.2241 + __m128i xmm_mask_lo, xmm_mask_hi; 1.2242 + 1.2243 + while (w && (uintptr_t)pd & 15) 1.2244 + { 1.2245 + s = *ps++; 1.2246 + m = *pm++; 1.2247 + d = *pd; 1.2248 + 1.2249 + *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); 1.2250 + w--; 1.2251 + } 1.2252 + 1.2253 + while (w >= 4) 1.2254 + { 1.2255 + xmm_dst_hi = load_128_aligned ((__m128i*)pd); 1.2256 + xmm_src_hi = load_128_unaligned 
((__m128i*)ps); 1.2257 + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1.2258 + 1.2259 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.2260 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.2261 + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1.2262 + 1.2263 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1.2264 + &xmm_alpha_src_lo, &xmm_alpha_src_hi); 1.2265 + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, 1.2266 + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1.2267 + 1.2268 + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1.2269 + &xmm_mask_lo, &xmm_mask_hi, 1.2270 + &xmm_src_lo, &xmm_src_hi); 1.2271 + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, 1.2272 + &xmm_alpha_src_lo, &xmm_alpha_src_hi, 1.2273 + &xmm_mask_lo, &xmm_mask_hi); 1.2274 + 1.2275 + negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, 1.2276 + &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); 1.2277 + negate_2x128 (xmm_mask_lo, xmm_mask_hi, 1.2278 + &xmm_mask_lo, &xmm_mask_hi); 1.2279 + 1.2280 + pix_add_multiply_2x128 ( 1.2281 + &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, 1.2282 + &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, 1.2283 + &xmm_dst_lo, &xmm_dst_hi); 1.2284 + 1.2285 + save_128_aligned ( 1.2286 + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.2287 + 1.2288 + ps += 4; 1.2289 + pd += 4; 1.2290 + pm += 4; 1.2291 + w -= 4; 1.2292 + } 1.2293 + 1.2294 + while (w) 1.2295 + { 1.2296 + s = *ps++; 1.2297 + m = *pm++; 1.2298 + d = *pd; 1.2299 + 1.2300 + *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); 1.2301 + w--; 1.2302 + } 1.2303 +} 1.2304 + 1.2305 +static void 1.2306 +sse2_combine_add_ca (pixman_implementation_t *imp, 1.2307 + pixman_op_t op, 1.2308 + uint32_t * pd, 1.2309 + const uint32_t * ps, 1.2310 + const uint32_t * pm, 1.2311 + int w) 1.2312 +{ 1.2313 + uint32_t s, m, d; 1.2314 + 1.2315 + __m128i xmm_src_lo, xmm_src_hi; 1.2316 + __m128i xmm_dst_lo, xmm_dst_hi; 1.2317 + __m128i xmm_mask_lo, xmm_mask_hi; 1.2318 + 1.2319 + while (w && (uintptr_t)pd & 15) 1.2320 + { 1.2321 + s = *ps++; 1.2322 + m = *pm++; 1.2323 + d = *pd; 1.2324 + 1.2325 + *pd++ = pack_1x128_32 ( 1.2326 + _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), 1.2327 + unpack_32_1x128 (m)), 1.2328 + unpack_32_1x128 (d))); 1.2329 + w--; 1.2330 + } 1.2331 + 1.2332 + while (w >= 4) 1.2333 + { 1.2334 + xmm_src_hi = load_128_unaligned ((__m128i*)ps); 1.2335 + xmm_mask_hi = load_128_unaligned ((__m128i*)pm); 1.2336 + xmm_dst_hi = load_128_aligned ((__m128i*)pd); 1.2337 + 1.2338 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.2339 + unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1.2340 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.2341 + 1.2342 + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1.2343 + &xmm_mask_lo, &xmm_mask_hi, 1.2344 + &xmm_src_lo, &xmm_src_hi); 1.2345 + 1.2346 + save_128_aligned ( 1.2347 + (__m128i*)pd, pack_2x128_128 ( 1.2348 + _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo), 1.2349 + _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi))); 1.2350 + 1.2351 + ps += 4; 1.2352 + pd += 4; 1.2353 + pm += 4; 1.2354 + w -= 4; 1.2355 + } 1.2356 + 1.2357 + while (w) 1.2358 + { 1.2359 + s = *ps++; 1.2360 + m = *pm++; 1.2361 + d = *pd; 1.2362 + 1.2363 + *pd++ = pack_1x128_32 ( 1.2364 + _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), 1.2365 + unpack_32_1x128 (m)), 1.2366 + unpack_32_1x128 (d))); 1.2367 + w--; 1.2368 + } 1.2369 +} 1.2370 + 1.2371 +static force_inline __m128i 1.2372 +create_mask_16_128 (uint16_t mask) 1.2373 +{ 1.2374 + return _mm_set1_epi16 (mask); 
1.2375 +} 1.2376 + 1.2377 +/* Work around a code generation bug in Sun Studio 12. */ 1.2378 +#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590) 1.2379 +# define create_mask_2x32_128(mask0, mask1) \ 1.2380 + (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1))) 1.2381 +#else 1.2382 +static force_inline __m128i 1.2383 +create_mask_2x32_128 (uint32_t mask0, 1.2384 + uint32_t mask1) 1.2385 +{ 1.2386 + return _mm_set_epi32 (mask0, mask1, mask0, mask1); 1.2387 +} 1.2388 +#endif 1.2389 + 1.2390 +static void 1.2391 +sse2_composite_over_n_8888 (pixman_implementation_t *imp, 1.2392 + pixman_composite_info_t *info) 1.2393 +{ 1.2394 + PIXMAN_COMPOSITE_ARGS (info); 1.2395 + uint32_t src; 1.2396 + uint32_t *dst_line, *dst, d; 1.2397 + int32_t w; 1.2398 + int dst_stride; 1.2399 + __m128i xmm_src, xmm_alpha; 1.2400 + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1.2401 + 1.2402 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.2403 + 1.2404 + if (src == 0) 1.2405 + return; 1.2406 + 1.2407 + PIXMAN_IMAGE_GET_LINE ( 1.2408 + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.2409 + 1.2410 + xmm_src = expand_pixel_32_1x128 (src); 1.2411 + xmm_alpha = expand_alpha_1x128 (xmm_src); 1.2412 + 1.2413 + while (height--) 1.2414 + { 1.2415 + dst = dst_line; 1.2416 + 1.2417 + dst_line += dst_stride; 1.2418 + w = width; 1.2419 + 1.2420 + while (w && (uintptr_t)dst & 15) 1.2421 + { 1.2422 + d = *dst; 1.2423 + *dst++ = pack_1x128_32 (over_1x128 (xmm_src, 1.2424 + xmm_alpha, 1.2425 + unpack_32_1x128 (d))); 1.2426 + w--; 1.2427 + } 1.2428 + 1.2429 + while (w >= 4) 1.2430 + { 1.2431 + xmm_dst = load_128_aligned ((__m128i*)dst); 1.2432 + 1.2433 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.2434 + 1.2435 + over_2x128 (&xmm_src, &xmm_src, 1.2436 + &xmm_alpha, &xmm_alpha, 1.2437 + &xmm_dst_lo, &xmm_dst_hi); 1.2438 + 1.2439 + /* rebuild the 4 pixel data and save */ 1.2440 + save_128_aligned ( 1.2441 + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.2442 + 1.2443 + w -= 4; 1.2444 + dst += 4; 1.2445 + } 1.2446 + 1.2447 + while (w) 1.2448 + { 1.2449 + d = *dst; 1.2450 + *dst++ = pack_1x128_32 (over_1x128 (xmm_src, 1.2451 + xmm_alpha, 1.2452 + unpack_32_1x128 (d))); 1.2453 + w--; 1.2454 + } 1.2455 + 1.2456 + } 1.2457 +} 1.2458 + 1.2459 +static void 1.2460 +sse2_composite_over_n_0565 (pixman_implementation_t *imp, 1.2461 + pixman_composite_info_t *info) 1.2462 +{ 1.2463 + PIXMAN_COMPOSITE_ARGS (info); 1.2464 + uint32_t src; 1.2465 + uint16_t *dst_line, *dst, d; 1.2466 + int32_t w; 1.2467 + int dst_stride; 1.2468 + __m128i xmm_src, xmm_alpha; 1.2469 + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; 1.2470 + 1.2471 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.2472 + 1.2473 + if (src == 0) 1.2474 + return; 1.2475 + 1.2476 + PIXMAN_IMAGE_GET_LINE ( 1.2477 + dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 1.2478 + 1.2479 + xmm_src = expand_pixel_32_1x128 (src); 1.2480 + xmm_alpha = expand_alpha_1x128 (xmm_src); 1.2481 + 1.2482 + while (height--) 1.2483 + { 1.2484 + dst = dst_line; 1.2485 + 1.2486 + dst_line += dst_stride; 1.2487 + w = width; 1.2488 + 1.2489 + while (w && (uintptr_t)dst & 15) 1.2490 + { 1.2491 + d = *dst; 1.2492 + 1.2493 + *dst++ = pack_565_32_16 ( 1.2494 + pack_1x128_32 (over_1x128 (xmm_src, 1.2495 + xmm_alpha, 1.2496 + expand565_16_1x128 (d)))); 1.2497 + w--; 1.2498 + } 1.2499 + 1.2500 + while (w >= 8) 1.2501 + { 1.2502 + xmm_dst = load_128_aligned ((__m128i*)dst); 1.2503 + 1.2504 + unpack_565_128_4x128 
(xmm_dst, 1.2505 + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); 1.2506 + 1.2507 + over_2x128 (&xmm_src, &xmm_src, 1.2508 + &xmm_alpha, &xmm_alpha, 1.2509 + &xmm_dst0, &xmm_dst1); 1.2510 + over_2x128 (&xmm_src, &xmm_src, 1.2511 + &xmm_alpha, &xmm_alpha, 1.2512 + &xmm_dst2, &xmm_dst3); 1.2513 + 1.2514 + xmm_dst = pack_565_4x128_128 ( 1.2515 + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); 1.2516 + 1.2517 + save_128_aligned ((__m128i*)dst, xmm_dst); 1.2518 + 1.2519 + dst += 8; 1.2520 + w -= 8; 1.2521 + } 1.2522 + 1.2523 + while (w--) 1.2524 + { 1.2525 + d = *dst; 1.2526 + *dst++ = pack_565_32_16 ( 1.2527 + pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha, 1.2528 + expand565_16_1x128 (d)))); 1.2529 + } 1.2530 + } 1.2531 + 1.2532 +} 1.2533 + 1.2534 +static void 1.2535 +sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, 1.2536 + pixman_composite_info_t *info) 1.2537 +{ 1.2538 + PIXMAN_COMPOSITE_ARGS (info); 1.2539 + uint32_t src; 1.2540 + uint32_t *dst_line, d; 1.2541 + uint32_t *mask_line, m; 1.2542 + uint32_t pack_cmp; 1.2543 + int dst_stride, mask_stride; 1.2544 + 1.2545 + __m128i xmm_src; 1.2546 + __m128i xmm_dst; 1.2547 + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 1.2548 + 1.2549 + __m128i mmx_src, mmx_mask, mmx_dest; 1.2550 + 1.2551 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.2552 + 1.2553 + if (src == 0) 1.2554 + return; 1.2555 + 1.2556 + PIXMAN_IMAGE_GET_LINE ( 1.2557 + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.2558 + PIXMAN_IMAGE_GET_LINE ( 1.2559 + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); 1.2560 + 1.2561 + xmm_src = _mm_unpacklo_epi8 ( 1.2562 + create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); 1.2563 + mmx_src = xmm_src; 1.2564 + 1.2565 + while (height--) 1.2566 + { 1.2567 + int w = width; 1.2568 + const uint32_t *pm = (uint32_t *)mask_line; 1.2569 + uint32_t *pd = (uint32_t *)dst_line; 1.2570 + 1.2571 + dst_line += dst_stride; 1.2572 + mask_line += mask_stride; 1.2573 + 1.2574 + while (w && (uintptr_t)pd & 15) 1.2575 + { 1.2576 + m = *pm++; 1.2577 + 1.2578 + if (m) 1.2579 + { 1.2580 + d = *pd; 1.2581 + 1.2582 + mmx_mask = unpack_32_1x128 (m); 1.2583 + mmx_dest = unpack_32_1x128 (d); 1.2584 + 1.2585 + *pd = pack_1x128_32 ( 1.2586 + _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), 1.2587 + mmx_dest)); 1.2588 + } 1.2589 + 1.2590 + pd++; 1.2591 + w--; 1.2592 + } 1.2593 + 1.2594 + while (w >= 4) 1.2595 + { 1.2596 + xmm_mask = load_128_unaligned ((__m128i*)pm); 1.2597 + 1.2598 + pack_cmp = 1.2599 + _mm_movemask_epi8 ( 1.2600 + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); 1.2601 + 1.2602 + /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ 1.2603 + if (pack_cmp != 0xffff) 1.2604 + { 1.2605 + xmm_dst = load_128_aligned ((__m128i*)pd); 1.2606 + 1.2607 + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 1.2608 + 1.2609 + pix_multiply_2x128 (&xmm_src, &xmm_src, 1.2610 + &xmm_mask_lo, &xmm_mask_hi, 1.2611 + &xmm_mask_lo, &xmm_mask_hi); 1.2612 + xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi); 1.2613 + 1.2614 + save_128_aligned ( 1.2615 + (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst)); 1.2616 + } 1.2617 + 1.2618 + pd += 4; 1.2619 + pm += 4; 1.2620 + w -= 4; 1.2621 + } 1.2622 + 1.2623 + while (w) 1.2624 + { 1.2625 + m = *pm++; 1.2626 + 1.2627 + if (m) 1.2628 + { 1.2629 + d = *pd; 1.2630 + 1.2631 + mmx_mask = unpack_32_1x128 (m); 1.2632 + mmx_dest = unpack_32_1x128 (d); 1.2633 + 1.2634 + *pd = pack_1x128_32 ( 1.2635 + _mm_adds_epu8 (pix_multiply_1x128 
(mmx_mask, mmx_src), 1.2636 + mmx_dest)); 1.2637 + } 1.2638 + 1.2639 + pd++; 1.2640 + w--; 1.2641 + } 1.2642 + } 1.2643 + 1.2644 +} 1.2645 + 1.2646 +static void 1.2647 +sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, 1.2648 + pixman_composite_info_t *info) 1.2649 +{ 1.2650 + PIXMAN_COMPOSITE_ARGS (info); 1.2651 + uint32_t src; 1.2652 + uint32_t *dst_line, d; 1.2653 + uint32_t *mask_line, m; 1.2654 + uint32_t pack_cmp; 1.2655 + int dst_stride, mask_stride; 1.2656 + 1.2657 + __m128i xmm_src, xmm_alpha; 1.2658 + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1.2659 + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 1.2660 + 1.2661 + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; 1.2662 + 1.2663 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.2664 + 1.2665 + if (src == 0) 1.2666 + return; 1.2667 + 1.2668 + PIXMAN_IMAGE_GET_LINE ( 1.2669 + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.2670 + PIXMAN_IMAGE_GET_LINE ( 1.2671 + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); 1.2672 + 1.2673 + xmm_src = _mm_unpacklo_epi8 ( 1.2674 + create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); 1.2675 + xmm_alpha = expand_alpha_1x128 (xmm_src); 1.2676 + mmx_src = xmm_src; 1.2677 + mmx_alpha = xmm_alpha; 1.2678 + 1.2679 + while (height--) 1.2680 + { 1.2681 + int w = width; 1.2682 + const uint32_t *pm = (uint32_t *)mask_line; 1.2683 + uint32_t *pd = (uint32_t *)dst_line; 1.2684 + 1.2685 + dst_line += dst_stride; 1.2686 + mask_line += mask_stride; 1.2687 + 1.2688 + while (w && (uintptr_t)pd & 15) 1.2689 + { 1.2690 + m = *pm++; 1.2691 + 1.2692 + if (m) 1.2693 + { 1.2694 + d = *pd; 1.2695 + mmx_mask = unpack_32_1x128 (m); 1.2696 + mmx_dest = unpack_32_1x128 (d); 1.2697 + 1.2698 + *pd = pack_1x128_32 (in_over_1x128 (&mmx_src, 1.2699 + &mmx_alpha, 1.2700 + &mmx_mask, 1.2701 + &mmx_dest)); 1.2702 + } 1.2703 + 1.2704 + pd++; 1.2705 + w--; 1.2706 + } 1.2707 + 1.2708 + while (w >= 4) 1.2709 + { 1.2710 + xmm_mask = load_128_unaligned ((__m128i*)pm); 1.2711 + 1.2712 + pack_cmp = 1.2713 + _mm_movemask_epi8 ( 1.2714 + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); 1.2715 + 1.2716 + /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ 1.2717 + if (pack_cmp != 0xffff) 1.2718 + { 1.2719 + xmm_dst = load_128_aligned ((__m128i*)pd); 1.2720 + 1.2721 + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 1.2722 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.2723 + 1.2724 + in_over_2x128 (&xmm_src, &xmm_src, 1.2725 + &xmm_alpha, &xmm_alpha, 1.2726 + &xmm_mask_lo, &xmm_mask_hi, 1.2727 + &xmm_dst_lo, &xmm_dst_hi); 1.2728 + 1.2729 + save_128_aligned ( 1.2730 + (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.2731 + } 1.2732 + 1.2733 + pd += 4; 1.2734 + pm += 4; 1.2735 + w -= 4; 1.2736 + } 1.2737 + 1.2738 + while (w) 1.2739 + { 1.2740 + m = *pm++; 1.2741 + 1.2742 + if (m) 1.2743 + { 1.2744 + d = *pd; 1.2745 + mmx_mask = unpack_32_1x128 (m); 1.2746 + mmx_dest = unpack_32_1x128 (d); 1.2747 + 1.2748 + *pd = pack_1x128_32 ( 1.2749 + in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); 1.2750 + } 1.2751 + 1.2752 + pd++; 1.2753 + w--; 1.2754 + } 1.2755 + } 1.2756 + 1.2757 +} 1.2758 + 1.2759 +static void 1.2760 +sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, 1.2761 + pixman_composite_info_t *info) 1.2762 +{ 1.2763 + PIXMAN_COMPOSITE_ARGS (info); 1.2764 + uint32_t *dst_line, *dst; 1.2765 + uint32_t *src_line, *src; 1.2766 + uint32_t mask; 1.2767 + int32_t w; 1.2768 + int dst_stride, src_stride; 1.2769 + 
1.2770 + __m128i xmm_mask; 1.2771 + __m128i xmm_src, xmm_src_lo, xmm_src_hi; 1.2772 + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1.2773 + __m128i xmm_alpha_lo, xmm_alpha_hi; 1.2774 + 1.2775 + PIXMAN_IMAGE_GET_LINE ( 1.2776 + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.2777 + PIXMAN_IMAGE_GET_LINE ( 1.2778 + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 1.2779 + 1.2780 + mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); 1.2781 + 1.2782 + xmm_mask = create_mask_16_128 (mask >> 24); 1.2783 + 1.2784 + while (height--) 1.2785 + { 1.2786 + dst = dst_line; 1.2787 + dst_line += dst_stride; 1.2788 + src = src_line; 1.2789 + src_line += src_stride; 1.2790 + w = width; 1.2791 + 1.2792 + while (w && (uintptr_t)dst & 15) 1.2793 + { 1.2794 + uint32_t s = *src++; 1.2795 + 1.2796 + if (s) 1.2797 + { 1.2798 + uint32_t d = *dst; 1.2799 + 1.2800 + __m128i ms = unpack_32_1x128 (s); 1.2801 + __m128i alpha = expand_alpha_1x128 (ms); 1.2802 + __m128i dest = xmm_mask; 1.2803 + __m128i alpha_dst = unpack_32_1x128 (d); 1.2804 + 1.2805 + *dst = pack_1x128_32 ( 1.2806 + in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); 1.2807 + } 1.2808 + dst++; 1.2809 + w--; 1.2810 + } 1.2811 + 1.2812 + while (w >= 4) 1.2813 + { 1.2814 + xmm_src = load_128_unaligned ((__m128i*)src); 1.2815 + 1.2816 + if (!is_zero (xmm_src)) 1.2817 + { 1.2818 + xmm_dst = load_128_aligned ((__m128i*)dst); 1.2819 + 1.2820 + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 1.2821 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.2822 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1.2823 + &xmm_alpha_lo, &xmm_alpha_hi); 1.2824 + 1.2825 + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, 1.2826 + &xmm_alpha_lo, &xmm_alpha_hi, 1.2827 + &xmm_mask, &xmm_mask, 1.2828 + &xmm_dst_lo, &xmm_dst_hi); 1.2829 + 1.2830 + save_128_aligned ( 1.2831 + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.2832 + } 1.2833 + 1.2834 + dst += 4; 1.2835 + src += 4; 1.2836 + w -= 4; 1.2837 + } 1.2838 + 1.2839 + while (w) 1.2840 + { 1.2841 + uint32_t s = *src++; 1.2842 + 1.2843 + if (s) 1.2844 + { 1.2845 + uint32_t d = *dst; 1.2846 + 1.2847 + __m128i ms = unpack_32_1x128 (s); 1.2848 + __m128i alpha = expand_alpha_1x128 (ms); 1.2849 + __m128i mask = xmm_mask; 1.2850 + __m128i dest = unpack_32_1x128 (d); 1.2851 + 1.2852 + *dst = pack_1x128_32 ( 1.2853 + in_over_1x128 (&ms, &alpha, &mask, &dest)); 1.2854 + } 1.2855 + 1.2856 + dst++; 1.2857 + w--; 1.2858 + } 1.2859 + } 1.2860 + 1.2861 +} 1.2862 + 1.2863 +static void 1.2864 +sse2_composite_src_x888_0565 (pixman_implementation_t *imp, 1.2865 + pixman_composite_info_t *info) 1.2866 +{ 1.2867 + PIXMAN_COMPOSITE_ARGS (info); 1.2868 + uint16_t *dst_line, *dst; 1.2869 + uint32_t *src_line, *src, s; 1.2870 + int dst_stride, src_stride; 1.2871 + int32_t w; 1.2872 + 1.2873 + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 1.2874 + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 1.2875 + 1.2876 + while (height--) 1.2877 + { 1.2878 + dst = dst_line; 1.2879 + dst_line += dst_stride; 1.2880 + src = src_line; 1.2881 + src_line += src_stride; 1.2882 + w = width; 1.2883 + 1.2884 + while (w && (uintptr_t)dst & 15) 1.2885 + { 1.2886 + s = *src++; 1.2887 + *dst = convert_8888_to_0565 (s); 1.2888 + dst++; 1.2889 + w--; 1.2890 + } 1.2891 + 1.2892 + while (w >= 8) 1.2893 + { 1.2894 + __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0); 1.2895 + __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1); 1.2896 + 
1.2897 + save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1)); 1.2898 + 1.2899 + w -= 8; 1.2900 + src += 8; 1.2901 + dst += 8; 1.2902 + } 1.2903 + 1.2904 + while (w) 1.2905 + { 1.2906 + s = *src++; 1.2907 + *dst = convert_8888_to_0565 (s); 1.2908 + dst++; 1.2909 + w--; 1.2910 + } 1.2911 + } 1.2912 +} 1.2913 + 1.2914 +static void 1.2915 +sse2_composite_src_x888_8888 (pixman_implementation_t *imp, 1.2916 + pixman_composite_info_t *info) 1.2917 +{ 1.2918 + PIXMAN_COMPOSITE_ARGS (info); 1.2919 + uint32_t *dst_line, *dst; 1.2920 + uint32_t *src_line, *src; 1.2921 + int32_t w; 1.2922 + int dst_stride, src_stride; 1.2923 + 1.2924 + 1.2925 + PIXMAN_IMAGE_GET_LINE ( 1.2926 + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.2927 + PIXMAN_IMAGE_GET_LINE ( 1.2928 + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 1.2929 + 1.2930 + while (height--) 1.2931 + { 1.2932 + dst = dst_line; 1.2933 + dst_line += dst_stride; 1.2934 + src = src_line; 1.2935 + src_line += src_stride; 1.2936 + w = width; 1.2937 + 1.2938 + while (w && (uintptr_t)dst & 15) 1.2939 + { 1.2940 + *dst++ = *src++ | 0xff000000; 1.2941 + w--; 1.2942 + } 1.2943 + 1.2944 + while (w >= 16) 1.2945 + { 1.2946 + __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4; 1.2947 + 1.2948 + xmm_src1 = load_128_unaligned ((__m128i*)src + 0); 1.2949 + xmm_src2 = load_128_unaligned ((__m128i*)src + 1); 1.2950 + xmm_src3 = load_128_unaligned ((__m128i*)src + 2); 1.2951 + xmm_src4 = load_128_unaligned ((__m128i*)src + 3); 1.2952 + 1.2953 + save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000)); 1.2954 + save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000)); 1.2955 + save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000)); 1.2956 + save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000)); 1.2957 + 1.2958 + dst += 16; 1.2959 + src += 16; 1.2960 + w -= 16; 1.2961 + } 1.2962 + 1.2963 + while (w) 1.2964 + { 1.2965 + *dst++ = *src++ | 0xff000000; 1.2966 + w--; 1.2967 + } 1.2968 + } 1.2969 + 1.2970 +} 1.2971 + 1.2972 +static void 1.2973 +sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, 1.2974 + pixman_composite_info_t *info) 1.2975 +{ 1.2976 + PIXMAN_COMPOSITE_ARGS (info); 1.2977 + uint32_t *dst_line, *dst; 1.2978 + uint32_t *src_line, *src; 1.2979 + uint32_t mask; 1.2980 + int dst_stride, src_stride; 1.2981 + int32_t w; 1.2982 + 1.2983 + __m128i xmm_mask, xmm_alpha; 1.2984 + __m128i xmm_src, xmm_src_lo, xmm_src_hi; 1.2985 + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1.2986 + 1.2987 + PIXMAN_IMAGE_GET_LINE ( 1.2988 + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.2989 + PIXMAN_IMAGE_GET_LINE ( 1.2990 + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 1.2991 + 1.2992 + mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); 1.2993 + 1.2994 + xmm_mask = create_mask_16_128 (mask >> 24); 1.2995 + xmm_alpha = mask_00ff; 1.2996 + 1.2997 + while (height--) 1.2998 + { 1.2999 + dst = dst_line; 1.3000 + dst_line += dst_stride; 1.3001 + src = src_line; 1.3002 + src_line += src_stride; 1.3003 + w = width; 1.3004 + 1.3005 + while (w && (uintptr_t)dst & 15) 1.3006 + { 1.3007 + uint32_t s = (*src++) | 0xff000000; 1.3008 + uint32_t d = *dst; 1.3009 + 1.3010 + __m128i src = unpack_32_1x128 (s); 1.3011 + __m128i alpha = xmm_alpha; 1.3012 + __m128i mask = xmm_mask; 1.3013 + __m128i dest = unpack_32_1x128 (d); 1.3014 + 1.3015 + *dst++ = pack_1x128_32 ( 1.3016 + in_over_1x128 (&src, &alpha, &mask, 
&dest)); 1.3017 + 1.3018 + w--; 1.3019 + } 1.3020 + 1.3021 + while (w >= 4) 1.3022 + { 1.3023 + xmm_src = _mm_or_si128 ( 1.3024 + load_128_unaligned ((__m128i*)src), mask_ff000000); 1.3025 + xmm_dst = load_128_aligned ((__m128i*)dst); 1.3026 + 1.3027 + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 1.3028 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.3029 + 1.3030 + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, 1.3031 + &xmm_alpha, &xmm_alpha, 1.3032 + &xmm_mask, &xmm_mask, 1.3033 + &xmm_dst_lo, &xmm_dst_hi); 1.3034 + 1.3035 + save_128_aligned ( 1.3036 + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.3037 + 1.3038 + dst += 4; 1.3039 + src += 4; 1.3040 + w -= 4; 1.3041 + 1.3042 + } 1.3043 + 1.3044 + while (w) 1.3045 + { 1.3046 + uint32_t s = (*src++) | 0xff000000; 1.3047 + uint32_t d = *dst; 1.3048 + 1.3049 + __m128i src = unpack_32_1x128 (s); 1.3050 + __m128i alpha = xmm_alpha; 1.3051 + __m128i mask = xmm_mask; 1.3052 + __m128i dest = unpack_32_1x128 (d); 1.3053 + 1.3054 + *dst++ = pack_1x128_32 ( 1.3055 + in_over_1x128 (&src, &alpha, &mask, &dest)); 1.3056 + 1.3057 + w--; 1.3058 + } 1.3059 + } 1.3060 + 1.3061 +} 1.3062 + 1.3063 +static void 1.3064 +sse2_composite_over_8888_8888 (pixman_implementation_t *imp, 1.3065 + pixman_composite_info_t *info) 1.3066 +{ 1.3067 + PIXMAN_COMPOSITE_ARGS (info); 1.3068 + int dst_stride, src_stride; 1.3069 + uint32_t *dst_line, *dst; 1.3070 + uint32_t *src_line, *src; 1.3071 + 1.3072 + PIXMAN_IMAGE_GET_LINE ( 1.3073 + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.3074 + PIXMAN_IMAGE_GET_LINE ( 1.3075 + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 1.3076 + 1.3077 + dst = dst_line; 1.3078 + src = src_line; 1.3079 + 1.3080 + while (height--) 1.3081 + { 1.3082 + sse2_combine_over_u (imp, op, dst, src, NULL, width); 1.3083 + 1.3084 + dst += dst_stride; 1.3085 + src += src_stride; 1.3086 + } 1.3087 +} 1.3088 + 1.3089 +static force_inline uint16_t 1.3090 +composite_over_8888_0565pixel (uint32_t src, uint16_t dst) 1.3091 +{ 1.3092 + __m128i ms; 1.3093 + 1.3094 + ms = unpack_32_1x128 (src); 1.3095 + return pack_565_32_16 ( 1.3096 + pack_1x128_32 ( 1.3097 + over_1x128 ( 1.3098 + ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst)))); 1.3099 +} 1.3100 + 1.3101 +static void 1.3102 +sse2_composite_over_8888_0565 (pixman_implementation_t *imp, 1.3103 + pixman_composite_info_t *info) 1.3104 +{ 1.3105 + PIXMAN_COMPOSITE_ARGS (info); 1.3106 + uint16_t *dst_line, *dst, d; 1.3107 + uint32_t *src_line, *src, s; 1.3108 + int dst_stride, src_stride; 1.3109 + int32_t w; 1.3110 + 1.3111 + __m128i xmm_alpha_lo, xmm_alpha_hi; 1.3112 + __m128i xmm_src, xmm_src_lo, xmm_src_hi; 1.3113 + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; 1.3114 + 1.3115 + PIXMAN_IMAGE_GET_LINE ( 1.3116 + dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 1.3117 + PIXMAN_IMAGE_GET_LINE ( 1.3118 + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 1.3119 + 1.3120 + while (height--) 1.3121 + { 1.3122 + dst = dst_line; 1.3123 + src = src_line; 1.3124 + 1.3125 + dst_line += dst_stride; 1.3126 + src_line += src_stride; 1.3127 + w = width; 1.3128 + 1.3129 + /* Align dst on a 16-byte boundary */ 1.3130 + while (w && 1.3131 + ((uintptr_t)dst & 15)) 1.3132 + { 1.3133 + s = *src++; 1.3134 + d = *dst; 1.3135 + 1.3136 + *dst++ = composite_over_8888_0565pixel (s, d); 1.3137 + w--; 1.3138 + } 1.3139 + 1.3140 + /* It's an 8-pixel loop */ 1.3141 + while (w >= 8) 1.3142 + { 1.3143 + /* I'm loading unaligned because I'm not sure 
1.3144 + * about the address alignment. 1.3145 + */ 1.3146 + xmm_src = load_128_unaligned ((__m128i*) src); 1.3147 + xmm_dst = load_128_aligned ((__m128i*) dst); 1.3148 + 1.3149 + /* Unpacking */ 1.3150 + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 1.3151 + unpack_565_128_4x128 (xmm_dst, 1.3152 + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); 1.3153 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1.3154 + &xmm_alpha_lo, &xmm_alpha_hi); 1.3155 + 1.3156 + /* I'm loading the next 4 pixels from memory 1.3157 + * ahead of time to optimize the memory read. 1.3158 + */ 1.3159 + xmm_src = load_128_unaligned ((__m128i*) (src + 4)); 1.3160 + 1.3161 + over_2x128 (&xmm_src_lo, &xmm_src_hi, 1.3162 + &xmm_alpha_lo, &xmm_alpha_hi, 1.3163 + &xmm_dst0, &xmm_dst1); 1.3164 + 1.3165 + /* Unpacking */ 1.3166 + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 1.3167 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1.3168 + &xmm_alpha_lo, &xmm_alpha_hi); 1.3169 + 1.3170 + over_2x128 (&xmm_src_lo, &xmm_src_hi, 1.3171 + &xmm_alpha_lo, &xmm_alpha_hi, 1.3172 + &xmm_dst2, &xmm_dst3); 1.3173 + 1.3174 + save_128_aligned ( 1.3175 + (__m128i*)dst, pack_565_4x128_128 ( 1.3176 + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); 1.3177 + 1.3178 + w -= 8; 1.3179 + dst += 8; 1.3180 + src += 8; 1.3181 + } 1.3182 + 1.3183 + while (w--) 1.3184 + { 1.3185 + s = *src++; 1.3186 + d = *dst; 1.3187 + 1.3188 + *dst++ = composite_over_8888_0565pixel (s, d); 1.3189 + } 1.3190 + } 1.3191 + 1.3192 +} 1.3193 + 1.3194 +static void 1.3195 +sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, 1.3196 + pixman_composite_info_t *info) 1.3197 +{ 1.3198 + PIXMAN_COMPOSITE_ARGS (info); 1.3199 + uint32_t src, srca; 1.3200 + uint32_t *dst_line, *dst; 1.3201 + uint8_t *mask_line, *mask; 1.3202 + int dst_stride, mask_stride; 1.3203 + int32_t w; 1.3204 + uint32_t m, d; 1.3205 + 1.3206 + __m128i xmm_src, xmm_alpha, xmm_def; 1.3207 + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1.3208 + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 1.3209 + 1.3210 + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; 1.3211 + 1.3212 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.3213 + 1.3214 + srca = src >> 24; 1.3215 + if (src == 0) 1.3216 + return; 1.3217 + 1.3218 + PIXMAN_IMAGE_GET_LINE ( 1.3219 + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.3220 + PIXMAN_IMAGE_GET_LINE ( 1.3221 + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 1.3222 + 1.3223 + xmm_def = create_mask_2x32_128 (src, src); 1.3224 + xmm_src = expand_pixel_32_1x128 (src); 1.3225 + xmm_alpha = expand_alpha_1x128 (xmm_src); 1.3226 + mmx_src = xmm_src; 1.3227 + mmx_alpha = xmm_alpha; 1.3228 + 1.3229 + while (height--) 1.3230 + { 1.3231 + dst = dst_line; 1.3232 + dst_line += dst_stride; 1.3233 + mask = mask_line; 1.3234 + mask_line += mask_stride; 1.3235 + w = width; 1.3236 + 1.3237 + while (w && (uintptr_t)dst & 15) 1.3238 + { 1.3239 + uint8_t m = *mask++; 1.3240 + 1.3241 + if (m) 1.3242 + { 1.3243 + d = *dst; 1.3244 + mmx_mask = expand_pixel_8_1x128 (m); 1.3245 + mmx_dest = unpack_32_1x128 (d); 1.3246 + 1.3247 + *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, 1.3248 + &mmx_alpha, 1.3249 + &mmx_mask, 1.3250 + &mmx_dest)); 1.3251 + } 1.3252 + 1.3253 + w--; 1.3254 + dst++; 1.3255 + } 1.3256 + 1.3257 + while (w >= 4) 1.3258 + { 1.3259 + m = *((uint32_t*)mask); 1.3260 + 1.3261 + if (srca == 0xff && m == 0xffffffff) 1.3262 + { 1.3263 + save_128_aligned ((__m128i*)dst, xmm_def); 1.3264 + } 1.3265 + else if (m) 1.3266 + { 1.3267 + xmm_dst = load_128_aligned 
((__m128i*) dst); 1.3268 + xmm_mask = unpack_32_1x128 (m); 1.3269 + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); 1.3270 + 1.3271 + /* Unpacking */ 1.3272 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.3273 + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 1.3274 + 1.3275 + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, 1.3276 + &xmm_mask_lo, &xmm_mask_hi); 1.3277 + 1.3278 + in_over_2x128 (&xmm_src, &xmm_src, 1.3279 + &xmm_alpha, &xmm_alpha, 1.3280 + &xmm_mask_lo, &xmm_mask_hi, 1.3281 + &xmm_dst_lo, &xmm_dst_hi); 1.3282 + 1.3283 + save_128_aligned ( 1.3284 + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.3285 + } 1.3286 + 1.3287 + w -= 4; 1.3288 + dst += 4; 1.3289 + mask += 4; 1.3290 + } 1.3291 + 1.3292 + while (w) 1.3293 + { 1.3294 + uint8_t m = *mask++; 1.3295 + 1.3296 + if (m) 1.3297 + { 1.3298 + d = *dst; 1.3299 + mmx_mask = expand_pixel_8_1x128 (m); 1.3300 + mmx_dest = unpack_32_1x128 (d); 1.3301 + 1.3302 + *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, 1.3303 + &mmx_alpha, 1.3304 + &mmx_mask, 1.3305 + &mmx_dest)); 1.3306 + } 1.3307 + 1.3308 + w--; 1.3309 + dst++; 1.3310 + } 1.3311 + } 1.3312 + 1.3313 +} 1.3314 + 1.3315 +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) 1.3316 +__attribute__((__force_align_arg_pointer__)) 1.3317 +#endif 1.3318 +static pixman_bool_t 1.3319 +sse2_fill (pixman_implementation_t *imp, 1.3320 + uint32_t * bits, 1.3321 + int stride, 1.3322 + int bpp, 1.3323 + int x, 1.3324 + int y, 1.3325 + int width, 1.3326 + int height, 1.3327 + uint32_t filler) 1.3328 +{ 1.3329 + uint32_t byte_width; 1.3330 + uint8_t *byte_line; 1.3331 + 1.3332 + __m128i xmm_def; 1.3333 + 1.3334 + if (bpp == 8) 1.3335 + { 1.3336 + uint8_t b; 1.3337 + uint16_t w; 1.3338 + 1.3339 + stride = stride * (int) sizeof (uint32_t) / 1; 1.3340 + byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); 1.3341 + byte_width = width; 1.3342 + stride *= 1; 1.3343 + 1.3344 + b = filler & 0xff; 1.3345 + w = (b << 8) | b; 1.3346 + filler = (w << 16) | w; 1.3347 + } 1.3348 + else if (bpp == 16) 1.3349 + { 1.3350 + stride = stride * (int) sizeof (uint32_t) / 2; 1.3351 + byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); 1.3352 + byte_width = 2 * width; 1.3353 + stride *= 2; 1.3354 + 1.3355 + filler = (filler & 0xffff) * 0x00010001; 1.3356 + } 1.3357 + else if (bpp == 32) 1.3358 + { 1.3359 + stride = stride * (int) sizeof (uint32_t) / 4; 1.3360 + byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); 1.3361 + byte_width = 4 * width; 1.3362 + stride *= 4; 1.3363 + } 1.3364 + else 1.3365 + { 1.3366 + return FALSE; 1.3367 + } 1.3368 + 1.3369 + xmm_def = create_mask_2x32_128 (filler, filler); 1.3370 + 1.3371 + while (height--) 1.3372 + { 1.3373 + int w; 1.3374 + uint8_t *d = byte_line; 1.3375 + byte_line += stride; 1.3376 + w = byte_width; 1.3377 + 1.3378 + if (w >= 1 && ((uintptr_t)d & 1)) 1.3379 + { 1.3380 + *(uint8_t *)d = filler; 1.3381 + w -= 1; 1.3382 + d += 1; 1.3383 + } 1.3384 + 1.3385 + while (w >= 2 && ((uintptr_t)d & 3)) 1.3386 + { 1.3387 + *(uint16_t *)d = filler; 1.3388 + w -= 2; 1.3389 + d += 2; 1.3390 + } 1.3391 + 1.3392 + while (w >= 4 && ((uintptr_t)d & 15)) 1.3393 + { 1.3394 + *(uint32_t *)d = filler; 1.3395 + 1.3396 + w -= 4; 1.3397 + d += 4; 1.3398 + } 1.3399 + 1.3400 + while (w >= 128) 1.3401 + { 1.3402 + save_128_aligned ((__m128i*)(d), xmm_def); 1.3403 + save_128_aligned ((__m128i*)(d + 16), xmm_def); 1.3404 + save_128_aligned ((__m128i*)(d + 32), xmm_def); 1.3405 + save_128_aligned ((__m128i*)(d + 
48), xmm_def); 1.3406 + save_128_aligned ((__m128i*)(d + 64), xmm_def); 1.3407 + save_128_aligned ((__m128i*)(d + 80), xmm_def); 1.3408 + save_128_aligned ((__m128i*)(d + 96), xmm_def); 1.3409 + save_128_aligned ((__m128i*)(d + 112), xmm_def); 1.3410 + 1.3411 + d += 128; 1.3412 + w -= 128; 1.3413 + } 1.3414 + 1.3415 + if (w >= 64) 1.3416 + { 1.3417 + save_128_aligned ((__m128i*)(d), xmm_def); 1.3418 + save_128_aligned ((__m128i*)(d + 16), xmm_def); 1.3419 + save_128_aligned ((__m128i*)(d + 32), xmm_def); 1.3420 + save_128_aligned ((__m128i*)(d + 48), xmm_def); 1.3421 + 1.3422 + d += 64; 1.3423 + w -= 64; 1.3424 + } 1.3425 + 1.3426 + if (w >= 32) 1.3427 + { 1.3428 + save_128_aligned ((__m128i*)(d), xmm_def); 1.3429 + save_128_aligned ((__m128i*)(d + 16), xmm_def); 1.3430 + 1.3431 + d += 32; 1.3432 + w -= 32; 1.3433 + } 1.3434 + 1.3435 + if (w >= 16) 1.3436 + { 1.3437 + save_128_aligned ((__m128i*)(d), xmm_def); 1.3438 + 1.3439 + d += 16; 1.3440 + w -= 16; 1.3441 + } 1.3442 + 1.3443 + while (w >= 4) 1.3444 + { 1.3445 + *(uint32_t *)d = filler; 1.3446 + 1.3447 + w -= 4; 1.3448 + d += 4; 1.3449 + } 1.3450 + 1.3451 + if (w >= 2) 1.3452 + { 1.3453 + *(uint16_t *)d = filler; 1.3454 + w -= 2; 1.3455 + d += 2; 1.3456 + } 1.3457 + 1.3458 + if (w >= 1) 1.3459 + { 1.3460 + *(uint8_t *)d = filler; 1.3461 + w -= 1; 1.3462 + d += 1; 1.3463 + } 1.3464 + } 1.3465 + 1.3466 + return TRUE; 1.3467 +} 1.3468 + 1.3469 +static void 1.3470 +sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, 1.3471 + pixman_composite_info_t *info) 1.3472 +{ 1.3473 + PIXMAN_COMPOSITE_ARGS (info); 1.3474 + uint32_t src, srca; 1.3475 + uint32_t *dst_line, *dst; 1.3476 + uint8_t *mask_line, *mask; 1.3477 + int dst_stride, mask_stride; 1.3478 + int32_t w; 1.3479 + uint32_t m; 1.3480 + 1.3481 + __m128i xmm_src, xmm_def; 1.3482 + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 1.3483 + 1.3484 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.3485 + 1.3486 + srca = src >> 24; 1.3487 + if (src == 0) 1.3488 + { 1.3489 + sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride, 1.3490 + PIXMAN_FORMAT_BPP (dest_image->bits.format), 1.3491 + dest_x, dest_y, width, height, 0); 1.3492 + return; 1.3493 + } 1.3494 + 1.3495 + PIXMAN_IMAGE_GET_LINE ( 1.3496 + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.3497 + PIXMAN_IMAGE_GET_LINE ( 1.3498 + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 1.3499 + 1.3500 + xmm_def = create_mask_2x32_128 (src, src); 1.3501 + xmm_src = expand_pixel_32_1x128 (src); 1.3502 + 1.3503 + while (height--) 1.3504 + { 1.3505 + dst = dst_line; 1.3506 + dst_line += dst_stride; 1.3507 + mask = mask_line; 1.3508 + mask_line += mask_stride; 1.3509 + w = width; 1.3510 + 1.3511 + while (w && (uintptr_t)dst & 15) 1.3512 + { 1.3513 + uint8_t m = *mask++; 1.3514 + 1.3515 + if (m) 1.3516 + { 1.3517 + *dst = pack_1x128_32 ( 1.3518 + pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m))); 1.3519 + } 1.3520 + else 1.3521 + { 1.3522 + *dst = 0; 1.3523 + } 1.3524 + 1.3525 + w--; 1.3526 + dst++; 1.3527 + } 1.3528 + 1.3529 + while (w >= 4) 1.3530 + { 1.3531 + m = *((uint32_t*)mask); 1.3532 + 1.3533 + if (srca == 0xff && m == 0xffffffff) 1.3534 + { 1.3535 + save_128_aligned ((__m128i*)dst, xmm_def); 1.3536 + } 1.3537 + else if (m) 1.3538 + { 1.3539 + xmm_mask = unpack_32_1x128 (m); 1.3540 + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); 1.3541 + 1.3542 + /* Unpacking */ 1.3543 + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 1.3544 + 1.3545 + 
expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, 1.3546 + &xmm_mask_lo, &xmm_mask_hi); 1.3547 + 1.3548 + pix_multiply_2x128 (&xmm_src, &xmm_src, 1.3549 + &xmm_mask_lo, &xmm_mask_hi, 1.3550 + &xmm_mask_lo, &xmm_mask_hi); 1.3551 + 1.3552 + save_128_aligned ( 1.3553 + (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); 1.3554 + } 1.3555 + else 1.3556 + { 1.3557 + save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ()); 1.3558 + } 1.3559 + 1.3560 + w -= 4; 1.3561 + dst += 4; 1.3562 + mask += 4; 1.3563 + } 1.3564 + 1.3565 + while (w) 1.3566 + { 1.3567 + uint8_t m = *mask++; 1.3568 + 1.3569 + if (m) 1.3570 + { 1.3571 + *dst = pack_1x128_32 ( 1.3572 + pix_multiply_1x128 ( 1.3573 + xmm_src, expand_pixel_8_1x128 (m))); 1.3574 + } 1.3575 + else 1.3576 + { 1.3577 + *dst = 0; 1.3578 + } 1.3579 + 1.3580 + w--; 1.3581 + dst++; 1.3582 + } 1.3583 + } 1.3584 + 1.3585 +} 1.3586 + 1.3587 +static void 1.3588 +sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, 1.3589 + pixman_composite_info_t *info) 1.3590 +{ 1.3591 + PIXMAN_COMPOSITE_ARGS (info); 1.3592 + uint32_t src; 1.3593 + uint16_t *dst_line, *dst, d; 1.3594 + uint8_t *mask_line, *mask; 1.3595 + int dst_stride, mask_stride; 1.3596 + int32_t w; 1.3597 + uint32_t m; 1.3598 + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; 1.3599 + 1.3600 + __m128i xmm_src, xmm_alpha; 1.3601 + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 1.3602 + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; 1.3603 + 1.3604 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.3605 + 1.3606 + if (src == 0) 1.3607 + return; 1.3608 + 1.3609 + PIXMAN_IMAGE_GET_LINE ( 1.3610 + dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 1.3611 + PIXMAN_IMAGE_GET_LINE ( 1.3612 + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 1.3613 + 1.3614 + xmm_src = expand_pixel_32_1x128 (src); 1.3615 + xmm_alpha = expand_alpha_1x128 (xmm_src); 1.3616 + mmx_src = xmm_src; 1.3617 + mmx_alpha = xmm_alpha; 1.3618 + 1.3619 + while (height--) 1.3620 + { 1.3621 + dst = dst_line; 1.3622 + dst_line += dst_stride; 1.3623 + mask = mask_line; 1.3624 + mask_line += mask_stride; 1.3625 + w = width; 1.3626 + 1.3627 + while (w && (uintptr_t)dst & 15) 1.3628 + { 1.3629 + m = *mask++; 1.3630 + 1.3631 + if (m) 1.3632 + { 1.3633 + d = *dst; 1.3634 + mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); 1.3635 + mmx_dest = expand565_16_1x128 (d); 1.3636 + 1.3637 + *dst = pack_565_32_16 ( 1.3638 + pack_1x128_32 ( 1.3639 + in_over_1x128 ( 1.3640 + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); 1.3641 + } 1.3642 + 1.3643 + w--; 1.3644 + dst++; 1.3645 + } 1.3646 + 1.3647 + while (w >= 8) 1.3648 + { 1.3649 + xmm_dst = load_128_aligned ((__m128i*) dst); 1.3650 + unpack_565_128_4x128 (xmm_dst, 1.3651 + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); 1.3652 + 1.3653 + m = *((uint32_t*)mask); 1.3654 + mask += 4; 1.3655 + 1.3656 + if (m) 1.3657 + { 1.3658 + xmm_mask = unpack_32_1x128 (m); 1.3659 + xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); 1.3660 + 1.3661 + /* Unpacking */ 1.3662 + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 1.3663 + 1.3664 + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, 1.3665 + &xmm_mask_lo, &xmm_mask_hi); 1.3666 + 1.3667 + in_over_2x128 (&xmm_src, &xmm_src, 1.3668 + &xmm_alpha, &xmm_alpha, 1.3669 + &xmm_mask_lo, &xmm_mask_hi, 1.3670 + &xmm_dst0, &xmm_dst1); 1.3671 + } 1.3672 + 1.3673 + m = *((uint32_t*)mask); 1.3674 + mask += 4; 1.3675 + 1.3676 + if (m) 1.3677 + { 1.3678 + xmm_mask = unpack_32_1x128 (m); 1.3679 + 
xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); 1.3680 + 1.3681 + /* Unpacking */ 1.3682 + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 1.3683 + 1.3684 + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, 1.3685 + &xmm_mask_lo, &xmm_mask_hi); 1.3686 + in_over_2x128 (&xmm_src, &xmm_src, 1.3687 + &xmm_alpha, &xmm_alpha, 1.3688 + &xmm_mask_lo, &xmm_mask_hi, 1.3689 + &xmm_dst2, &xmm_dst3); 1.3690 + } 1.3691 + 1.3692 + save_128_aligned ( 1.3693 + (__m128i*)dst, pack_565_4x128_128 ( 1.3694 + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); 1.3695 + 1.3696 + w -= 8; 1.3697 + dst += 8; 1.3698 + } 1.3699 + 1.3700 + while (w) 1.3701 + { 1.3702 + m = *mask++; 1.3703 + 1.3704 + if (m) 1.3705 + { 1.3706 + d = *dst; 1.3707 + mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); 1.3708 + mmx_dest = expand565_16_1x128 (d); 1.3709 + 1.3710 + *dst = pack_565_32_16 ( 1.3711 + pack_1x128_32 ( 1.3712 + in_over_1x128 ( 1.3713 + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); 1.3714 + } 1.3715 + 1.3716 + w--; 1.3717 + dst++; 1.3718 + } 1.3719 + } 1.3720 + 1.3721 +} 1.3722 + 1.3723 +static void 1.3724 +sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, 1.3725 + pixman_composite_info_t *info) 1.3726 +{ 1.3727 + PIXMAN_COMPOSITE_ARGS (info); 1.3728 + uint16_t *dst_line, *dst, d; 1.3729 + uint32_t *src_line, *src, s; 1.3730 + int dst_stride, src_stride; 1.3731 + int32_t w; 1.3732 + uint32_t opaque, zero; 1.3733 + 1.3734 + __m128i ms; 1.3735 + __m128i xmm_src, xmm_src_lo, xmm_src_hi; 1.3736 + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; 1.3737 + 1.3738 + PIXMAN_IMAGE_GET_LINE ( 1.3739 + dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 1.3740 + PIXMAN_IMAGE_GET_LINE ( 1.3741 + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 1.3742 + 1.3743 + while (height--) 1.3744 + { 1.3745 + dst = dst_line; 1.3746 + dst_line += dst_stride; 1.3747 + src = src_line; 1.3748 + src_line += src_stride; 1.3749 + w = width; 1.3750 + 1.3751 + while (w && (uintptr_t)dst & 15) 1.3752 + { 1.3753 + s = *src++; 1.3754 + d = *dst; 1.3755 + 1.3756 + ms = unpack_32_1x128 (s); 1.3757 + 1.3758 + *dst++ = pack_565_32_16 ( 1.3759 + pack_1x128_32 ( 1.3760 + over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); 1.3761 + w--; 1.3762 + } 1.3763 + 1.3764 + while (w >= 8) 1.3765 + { 1.3766 + /* First round */ 1.3767 + xmm_src = load_128_unaligned ((__m128i*)src); 1.3768 + xmm_dst = load_128_aligned ((__m128i*)dst); 1.3769 + 1.3770 + opaque = is_opaque (xmm_src); 1.3771 + zero = is_zero (xmm_src); 1.3772 + 1.3773 + unpack_565_128_4x128 (xmm_dst, 1.3774 + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); 1.3775 + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 1.3776 + 1.3777 + /* preload next round*/ 1.3778 + xmm_src = load_128_unaligned ((__m128i*)(src + 4)); 1.3779 + 1.3780 + if (opaque) 1.3781 + { 1.3782 + invert_colors_2x128 (xmm_src_lo, xmm_src_hi, 1.3783 + &xmm_dst0, &xmm_dst1); 1.3784 + } 1.3785 + else if (!zero) 1.3786 + { 1.3787 + over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, 1.3788 + &xmm_dst0, &xmm_dst1); 1.3789 + } 1.3790 + 1.3791 + /* Second round */ 1.3792 + opaque = is_opaque (xmm_src); 1.3793 + zero = is_zero (xmm_src); 1.3794 + 1.3795 + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 1.3796 + 1.3797 + if (opaque) 1.3798 + { 1.3799 + invert_colors_2x128 (xmm_src_lo, xmm_src_hi, 1.3800 + &xmm_dst2, &xmm_dst3); 1.3801 + } 1.3802 + else if (!zero) 1.3803 + { 1.3804 + over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, 1.3805 + &xmm_dst2, &xmm_dst3); 1.3806 + } 
1.3807 + 1.3808 + save_128_aligned ( 1.3809 + (__m128i*)dst, pack_565_4x128_128 ( 1.3810 + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); 1.3811 + 1.3812 + w -= 8; 1.3813 + src += 8; 1.3814 + dst += 8; 1.3815 + } 1.3816 + 1.3817 + while (w) 1.3818 + { 1.3819 + s = *src++; 1.3820 + d = *dst; 1.3821 + 1.3822 + ms = unpack_32_1x128 (s); 1.3823 + 1.3824 + *dst++ = pack_565_32_16 ( 1.3825 + pack_1x128_32 ( 1.3826 + over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); 1.3827 + w--; 1.3828 + } 1.3829 + } 1.3830 + 1.3831 +} 1.3832 + 1.3833 +static void 1.3834 +sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, 1.3835 + pixman_composite_info_t *info) 1.3836 +{ 1.3837 + PIXMAN_COMPOSITE_ARGS (info); 1.3838 + uint32_t *dst_line, *dst, d; 1.3839 + uint32_t *src_line, *src, s; 1.3840 + int dst_stride, src_stride; 1.3841 + int32_t w; 1.3842 + uint32_t opaque, zero; 1.3843 + 1.3844 + __m128i xmm_src_lo, xmm_src_hi; 1.3845 + __m128i xmm_dst_lo, xmm_dst_hi; 1.3846 + 1.3847 + PIXMAN_IMAGE_GET_LINE ( 1.3848 + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.3849 + PIXMAN_IMAGE_GET_LINE ( 1.3850 + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 1.3851 + 1.3852 + while (height--) 1.3853 + { 1.3854 + dst = dst_line; 1.3855 + dst_line += dst_stride; 1.3856 + src = src_line; 1.3857 + src_line += src_stride; 1.3858 + w = width; 1.3859 + 1.3860 + while (w && (uintptr_t)dst & 15) 1.3861 + { 1.3862 + s = *src++; 1.3863 + d = *dst; 1.3864 + 1.3865 + *dst++ = pack_1x128_32 ( 1.3866 + over_rev_non_pre_1x128 ( 1.3867 + unpack_32_1x128 (s), unpack_32_1x128 (d))); 1.3868 + 1.3869 + w--; 1.3870 + } 1.3871 + 1.3872 + while (w >= 4) 1.3873 + { 1.3874 + xmm_src_hi = load_128_unaligned ((__m128i*)src); 1.3875 + 1.3876 + opaque = is_opaque (xmm_src_hi); 1.3877 + zero = is_zero (xmm_src_hi); 1.3878 + 1.3879 + unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.3880 + 1.3881 + if (opaque) 1.3882 + { 1.3883 + invert_colors_2x128 (xmm_src_lo, xmm_src_hi, 1.3884 + &xmm_dst_lo, &xmm_dst_hi); 1.3885 + 1.3886 + save_128_aligned ( 1.3887 + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.3888 + } 1.3889 + else if (!zero) 1.3890 + { 1.3891 + xmm_dst_hi = load_128_aligned ((__m128i*)dst); 1.3892 + 1.3893 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.3894 + 1.3895 + over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, 1.3896 + &xmm_dst_lo, &xmm_dst_hi); 1.3897 + 1.3898 + save_128_aligned ( 1.3899 + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.3900 + } 1.3901 + 1.3902 + w -= 4; 1.3903 + dst += 4; 1.3904 + src += 4; 1.3905 + } 1.3906 + 1.3907 + while (w) 1.3908 + { 1.3909 + s = *src++; 1.3910 + d = *dst; 1.3911 + 1.3912 + *dst++ = pack_1x128_32 ( 1.3913 + over_rev_non_pre_1x128 ( 1.3914 + unpack_32_1x128 (s), unpack_32_1x128 (d))); 1.3915 + 1.3916 + w--; 1.3917 + } 1.3918 + } 1.3919 + 1.3920 +} 1.3921 + 1.3922 +static void 1.3923 +sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, 1.3924 + pixman_composite_info_t *info) 1.3925 +{ 1.3926 + PIXMAN_COMPOSITE_ARGS (info); 1.3927 + uint32_t src; 1.3928 + uint16_t *dst_line, *dst, d; 1.3929 + uint32_t *mask_line, *mask, m; 1.3930 + int dst_stride, mask_stride; 1.3931 + int w; 1.3932 + uint32_t pack_cmp; 1.3933 + 1.3934 + __m128i xmm_src, xmm_alpha; 1.3935 + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 1.3936 + __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; 1.3937 + 1.3938 + __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; 1.3939 + 1.3940 + src = _pixman_image_get_solid (imp, src_image, 
dest_image->bits.format); 1.3941 + 1.3942 + if (src == 0) 1.3943 + return; 1.3944 + 1.3945 + PIXMAN_IMAGE_GET_LINE ( 1.3946 + dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 1.3947 + PIXMAN_IMAGE_GET_LINE ( 1.3948 + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); 1.3949 + 1.3950 + xmm_src = expand_pixel_32_1x128 (src); 1.3951 + xmm_alpha = expand_alpha_1x128 (xmm_src); 1.3952 + mmx_src = xmm_src; 1.3953 + mmx_alpha = xmm_alpha; 1.3954 + 1.3955 + while (height--) 1.3956 + { 1.3957 + w = width; 1.3958 + mask = mask_line; 1.3959 + dst = dst_line; 1.3960 + mask_line += mask_stride; 1.3961 + dst_line += dst_stride; 1.3962 + 1.3963 + while (w && ((uintptr_t)dst & 15)) 1.3964 + { 1.3965 + m = *(uint32_t *) mask; 1.3966 + 1.3967 + if (m) 1.3968 + { 1.3969 + d = *dst; 1.3970 + mmx_mask = unpack_32_1x128 (m); 1.3971 + mmx_dest = expand565_16_1x128 (d); 1.3972 + 1.3973 + *dst = pack_565_32_16 ( 1.3974 + pack_1x128_32 ( 1.3975 + in_over_1x128 ( 1.3976 + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); 1.3977 + } 1.3978 + 1.3979 + w--; 1.3980 + dst++; 1.3981 + mask++; 1.3982 + } 1.3983 + 1.3984 + while (w >= 8) 1.3985 + { 1.3986 + /* First round */ 1.3987 + xmm_mask = load_128_unaligned ((__m128i*)mask); 1.3988 + xmm_dst = load_128_aligned ((__m128i*)dst); 1.3989 + 1.3990 + pack_cmp = _mm_movemask_epi8 ( 1.3991 + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); 1.3992 + 1.3993 + unpack_565_128_4x128 (xmm_dst, 1.3994 + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); 1.3995 + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 1.3996 + 1.3997 + /* preload next round */ 1.3998 + xmm_mask = load_128_unaligned ((__m128i*)(mask + 4)); 1.3999 + 1.4000 + /* preload next round */ 1.4001 + if (pack_cmp != 0xffff) 1.4002 + { 1.4003 + in_over_2x128 (&xmm_src, &xmm_src, 1.4004 + &xmm_alpha, &xmm_alpha, 1.4005 + &xmm_mask_lo, &xmm_mask_hi, 1.4006 + &xmm_dst0, &xmm_dst1); 1.4007 + } 1.4008 + 1.4009 + /* Second round */ 1.4010 + pack_cmp = _mm_movemask_epi8 ( 1.4011 + _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); 1.4012 + 1.4013 + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 1.4014 + 1.4015 + if (pack_cmp != 0xffff) 1.4016 + { 1.4017 + in_over_2x128 (&xmm_src, &xmm_src, 1.4018 + &xmm_alpha, &xmm_alpha, 1.4019 + &xmm_mask_lo, &xmm_mask_hi, 1.4020 + &xmm_dst2, &xmm_dst3); 1.4021 + } 1.4022 + 1.4023 + save_128_aligned ( 1.4024 + (__m128i*)dst, pack_565_4x128_128 ( 1.4025 + &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); 1.4026 + 1.4027 + w -= 8; 1.4028 + dst += 8; 1.4029 + mask += 8; 1.4030 + } 1.4031 + 1.4032 + while (w) 1.4033 + { 1.4034 + m = *(uint32_t *) mask; 1.4035 + 1.4036 + if (m) 1.4037 + { 1.4038 + d = *dst; 1.4039 + mmx_mask = unpack_32_1x128 (m); 1.4040 + mmx_dest = expand565_16_1x128 (d); 1.4041 + 1.4042 + *dst = pack_565_32_16 ( 1.4043 + pack_1x128_32 ( 1.4044 + in_over_1x128 ( 1.4045 + &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); 1.4046 + } 1.4047 + 1.4048 + w--; 1.4049 + dst++; 1.4050 + mask++; 1.4051 + } 1.4052 + } 1.4053 + 1.4054 +} 1.4055 + 1.4056 +static void 1.4057 +sse2_composite_in_n_8_8 (pixman_implementation_t *imp, 1.4058 + pixman_composite_info_t *info) 1.4059 +{ 1.4060 + PIXMAN_COMPOSITE_ARGS (info); 1.4061 + uint8_t *dst_line, *dst; 1.4062 + uint8_t *mask_line, *mask; 1.4063 + int dst_stride, mask_stride; 1.4064 + uint32_t d, m; 1.4065 + uint32_t src; 1.4066 + int32_t w; 1.4067 + 1.4068 + __m128i xmm_alpha; 1.4069 + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 1.4070 + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1.4071 + 1.4072 + 
PIXMAN_IMAGE_GET_LINE ( 1.4073 + dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 1.4074 + PIXMAN_IMAGE_GET_LINE ( 1.4075 + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 1.4076 + 1.4077 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.4078 + 1.4079 + xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); 1.4080 + 1.4081 + while (height--) 1.4082 + { 1.4083 + dst = dst_line; 1.4084 + dst_line += dst_stride; 1.4085 + mask = mask_line; 1.4086 + mask_line += mask_stride; 1.4087 + w = width; 1.4088 + 1.4089 + while (w && ((uintptr_t)dst & 15)) 1.4090 + { 1.4091 + m = (uint32_t) *mask++; 1.4092 + d = (uint32_t) *dst; 1.4093 + 1.4094 + *dst++ = (uint8_t) pack_1x128_32 ( 1.4095 + pix_multiply_1x128 ( 1.4096 + pix_multiply_1x128 (xmm_alpha, 1.4097 + unpack_32_1x128 (m)), 1.4098 + unpack_32_1x128 (d))); 1.4099 + w--; 1.4100 + } 1.4101 + 1.4102 + while (w >= 16) 1.4103 + { 1.4104 + xmm_mask = load_128_unaligned ((__m128i*)mask); 1.4105 + xmm_dst = load_128_aligned ((__m128i*)dst); 1.4106 + 1.4107 + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 1.4108 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.4109 + 1.4110 + pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, 1.4111 + &xmm_mask_lo, &xmm_mask_hi, 1.4112 + &xmm_mask_lo, &xmm_mask_hi); 1.4113 + 1.4114 + pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, 1.4115 + &xmm_dst_lo, &xmm_dst_hi, 1.4116 + &xmm_dst_lo, &xmm_dst_hi); 1.4117 + 1.4118 + save_128_aligned ( 1.4119 + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.4120 + 1.4121 + mask += 16; 1.4122 + dst += 16; 1.4123 + w -= 16; 1.4124 + } 1.4125 + 1.4126 + while (w) 1.4127 + { 1.4128 + m = (uint32_t) *mask++; 1.4129 + d = (uint32_t) *dst; 1.4130 + 1.4131 + *dst++ = (uint8_t) pack_1x128_32 ( 1.4132 + pix_multiply_1x128 ( 1.4133 + pix_multiply_1x128 ( 1.4134 + xmm_alpha, unpack_32_1x128 (m)), 1.4135 + unpack_32_1x128 (d))); 1.4136 + w--; 1.4137 + } 1.4138 + } 1.4139 + 1.4140 +} 1.4141 + 1.4142 +static void 1.4143 +sse2_composite_in_n_8 (pixman_implementation_t *imp, 1.4144 + pixman_composite_info_t *info) 1.4145 +{ 1.4146 + PIXMAN_COMPOSITE_ARGS (info); 1.4147 + uint8_t *dst_line, *dst; 1.4148 + int dst_stride; 1.4149 + uint32_t d; 1.4150 + uint32_t src; 1.4151 + int32_t w; 1.4152 + 1.4153 + __m128i xmm_alpha; 1.4154 + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1.4155 + 1.4156 + PIXMAN_IMAGE_GET_LINE ( 1.4157 + dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 1.4158 + 1.4159 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.4160 + 1.4161 + xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); 1.4162 + 1.4163 + src = src >> 24; 1.4164 + 1.4165 + if (src == 0xff) 1.4166 + return; 1.4167 + 1.4168 + if (src == 0x00) 1.4169 + { 1.4170 + pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 1.4171 + 8, dest_x, dest_y, width, height, src); 1.4172 + 1.4173 + return; 1.4174 + } 1.4175 + 1.4176 + while (height--) 1.4177 + { 1.4178 + dst = dst_line; 1.4179 + dst_line += dst_stride; 1.4180 + w = width; 1.4181 + 1.4182 + while (w && ((uintptr_t)dst & 15)) 1.4183 + { 1.4184 + d = (uint32_t) *dst; 1.4185 + 1.4186 + *dst++ = (uint8_t) pack_1x128_32 ( 1.4187 + pix_multiply_1x128 ( 1.4188 + xmm_alpha, 1.4189 + unpack_32_1x128 (d))); 1.4190 + w--; 1.4191 + } 1.4192 + 1.4193 + while (w >= 16) 1.4194 + { 1.4195 + xmm_dst = load_128_aligned ((__m128i*)dst); 1.4196 + 1.4197 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.4198 + 1.4199 + pix_multiply_2x128 (&xmm_alpha, 
&xmm_alpha, 1.4200 + &xmm_dst_lo, &xmm_dst_hi, 1.4201 + &xmm_dst_lo, &xmm_dst_hi); 1.4202 + 1.4203 + save_128_aligned ( 1.4204 + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.4205 + 1.4206 + dst += 16; 1.4207 + w -= 16; 1.4208 + } 1.4209 + 1.4210 + while (w) 1.4211 + { 1.4212 + d = (uint32_t) *dst; 1.4213 + 1.4214 + *dst++ = (uint8_t) pack_1x128_32 ( 1.4215 + pix_multiply_1x128 ( 1.4216 + xmm_alpha, 1.4217 + unpack_32_1x128 (d))); 1.4218 + w--; 1.4219 + } 1.4220 + } 1.4221 + 1.4222 +} 1.4223 + 1.4224 +static void 1.4225 +sse2_composite_in_8_8 (pixman_implementation_t *imp, 1.4226 + pixman_composite_info_t *info) 1.4227 +{ 1.4228 + PIXMAN_COMPOSITE_ARGS (info); 1.4229 + uint8_t *dst_line, *dst; 1.4230 + uint8_t *src_line, *src; 1.4231 + int src_stride, dst_stride; 1.4232 + int32_t w; 1.4233 + uint32_t s, d; 1.4234 + 1.4235 + __m128i xmm_src, xmm_src_lo, xmm_src_hi; 1.4236 + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1.4237 + 1.4238 + PIXMAN_IMAGE_GET_LINE ( 1.4239 + dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 1.4240 + PIXMAN_IMAGE_GET_LINE ( 1.4241 + src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); 1.4242 + 1.4243 + while (height--) 1.4244 + { 1.4245 + dst = dst_line; 1.4246 + dst_line += dst_stride; 1.4247 + src = src_line; 1.4248 + src_line += src_stride; 1.4249 + w = width; 1.4250 + 1.4251 + while (w && ((uintptr_t)dst & 15)) 1.4252 + { 1.4253 + s = (uint32_t) *src++; 1.4254 + d = (uint32_t) *dst; 1.4255 + 1.4256 + *dst++ = (uint8_t) pack_1x128_32 ( 1.4257 + pix_multiply_1x128 ( 1.4258 + unpack_32_1x128 (s), unpack_32_1x128 (d))); 1.4259 + w--; 1.4260 + } 1.4261 + 1.4262 + while (w >= 16) 1.4263 + { 1.4264 + xmm_src = load_128_unaligned ((__m128i*)src); 1.4265 + xmm_dst = load_128_aligned ((__m128i*)dst); 1.4266 + 1.4267 + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 1.4268 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.4269 + 1.4270 + pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, 1.4271 + &xmm_dst_lo, &xmm_dst_hi, 1.4272 + &xmm_dst_lo, &xmm_dst_hi); 1.4273 + 1.4274 + save_128_aligned ( 1.4275 + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.4276 + 1.4277 + src += 16; 1.4278 + dst += 16; 1.4279 + w -= 16; 1.4280 + } 1.4281 + 1.4282 + while (w) 1.4283 + { 1.4284 + s = (uint32_t) *src++; 1.4285 + d = (uint32_t) *dst; 1.4286 + 1.4287 + *dst++ = (uint8_t) pack_1x128_32 ( 1.4288 + pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d))); 1.4289 + w--; 1.4290 + } 1.4291 + } 1.4292 + 1.4293 +} 1.4294 + 1.4295 +static void 1.4296 +sse2_composite_add_n_8_8 (pixman_implementation_t *imp, 1.4297 + pixman_composite_info_t *info) 1.4298 +{ 1.4299 + PIXMAN_COMPOSITE_ARGS (info); 1.4300 + uint8_t *dst_line, *dst; 1.4301 + uint8_t *mask_line, *mask; 1.4302 + int dst_stride, mask_stride; 1.4303 + int32_t w; 1.4304 + uint32_t src; 1.4305 + uint32_t m, d; 1.4306 + 1.4307 + __m128i xmm_alpha; 1.4308 + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 1.4309 + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1.4310 + 1.4311 + PIXMAN_IMAGE_GET_LINE ( 1.4312 + dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 1.4313 + PIXMAN_IMAGE_GET_LINE ( 1.4314 + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 1.4315 + 1.4316 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.4317 + 1.4318 + xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); 1.4319 + 1.4320 + while (height--) 1.4321 + { 1.4322 + dst = dst_line; 1.4323 + dst_line += dst_stride; 1.4324 + mask = mask_line; 1.4325 + mask_line += 
mask_stride; 1.4326 + w = width; 1.4327 + 1.4328 + while (w && ((uintptr_t)dst & 15)) 1.4329 + { 1.4330 + m = (uint32_t) *mask++; 1.4331 + d = (uint32_t) *dst; 1.4332 + 1.4333 + *dst++ = (uint8_t) pack_1x128_32 ( 1.4334 + _mm_adds_epu16 ( 1.4335 + pix_multiply_1x128 ( 1.4336 + xmm_alpha, unpack_32_1x128 (m)), 1.4337 + unpack_32_1x128 (d))); 1.4338 + w--; 1.4339 + } 1.4340 + 1.4341 + while (w >= 16) 1.4342 + { 1.4343 + xmm_mask = load_128_unaligned ((__m128i*)mask); 1.4344 + xmm_dst = load_128_aligned ((__m128i*)dst); 1.4345 + 1.4346 + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 1.4347 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.4348 + 1.4349 + pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, 1.4350 + &xmm_mask_lo, &xmm_mask_hi, 1.4351 + &xmm_mask_lo, &xmm_mask_hi); 1.4352 + 1.4353 + xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); 1.4354 + xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); 1.4355 + 1.4356 + save_128_aligned ( 1.4357 + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.4358 + 1.4359 + mask += 16; 1.4360 + dst += 16; 1.4361 + w -= 16; 1.4362 + } 1.4363 + 1.4364 + while (w) 1.4365 + { 1.4366 + m = (uint32_t) *mask++; 1.4367 + d = (uint32_t) *dst; 1.4368 + 1.4369 + *dst++ = (uint8_t) pack_1x128_32 ( 1.4370 + _mm_adds_epu16 ( 1.4371 + pix_multiply_1x128 ( 1.4372 + xmm_alpha, unpack_32_1x128 (m)), 1.4373 + unpack_32_1x128 (d))); 1.4374 + 1.4375 + w--; 1.4376 + } 1.4377 + } 1.4378 + 1.4379 +} 1.4380 + 1.4381 +static void 1.4382 +sse2_composite_add_n_8 (pixman_implementation_t *imp, 1.4383 + pixman_composite_info_t *info) 1.4384 +{ 1.4385 + PIXMAN_COMPOSITE_ARGS (info); 1.4386 + uint8_t *dst_line, *dst; 1.4387 + int dst_stride; 1.4388 + int32_t w; 1.4389 + uint32_t src; 1.4390 + 1.4391 + __m128i xmm_src; 1.4392 + 1.4393 + PIXMAN_IMAGE_GET_LINE ( 1.4394 + dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 1.4395 + 1.4396 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.4397 + 1.4398 + src >>= 24; 1.4399 + 1.4400 + if (src == 0x00) 1.4401 + return; 1.4402 + 1.4403 + if (src == 0xff) 1.4404 + { 1.4405 + pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 1.4406 + 8, dest_x, dest_y, width, height, 0xff); 1.4407 + 1.4408 + return; 1.4409 + } 1.4410 + 1.4411 + src = (src << 24) | (src << 16) | (src << 8) | src; 1.4412 + xmm_src = _mm_set_epi32 (src, src, src, src); 1.4413 + 1.4414 + while (height--) 1.4415 + { 1.4416 + dst = dst_line; 1.4417 + dst_line += dst_stride; 1.4418 + w = width; 1.4419 + 1.4420 + while (w && ((uintptr_t)dst & 15)) 1.4421 + { 1.4422 + *dst = (uint8_t)_mm_cvtsi128_si32 ( 1.4423 + _mm_adds_epu8 ( 1.4424 + xmm_src, 1.4425 + _mm_cvtsi32_si128 (*dst))); 1.4426 + 1.4427 + w--; 1.4428 + dst++; 1.4429 + } 1.4430 + 1.4431 + while (w >= 16) 1.4432 + { 1.4433 + save_128_aligned ( 1.4434 + (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); 1.4435 + 1.4436 + dst += 16; 1.4437 + w -= 16; 1.4438 + } 1.4439 + 1.4440 + while (w) 1.4441 + { 1.4442 + *dst = (uint8_t)_mm_cvtsi128_si32 ( 1.4443 + _mm_adds_epu8 ( 1.4444 + xmm_src, 1.4445 + _mm_cvtsi32_si128 (*dst))); 1.4446 + 1.4447 + w--; 1.4448 + dst++; 1.4449 + } 1.4450 + } 1.4451 + 1.4452 +} 1.4453 + 1.4454 +static void 1.4455 +sse2_composite_add_8_8 (pixman_implementation_t *imp, 1.4456 + pixman_composite_info_t *info) 1.4457 +{ 1.4458 + PIXMAN_COMPOSITE_ARGS (info); 1.4459 + uint8_t *dst_line, *dst; 1.4460 + uint8_t *src_line, *src; 1.4461 + int dst_stride, src_stride; 1.4462 + int32_t w; 1.4463 + uint16_t t; 
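/*
 * The scalar head/tail loops below saturate without branching: two a8 pixels
 * are summed into a 16-bit temporary, and "t | (0 - (t >> 8))" ORs in all
 * ones whenever the sum exceeds 0xff (t >> 8 becomes 1, so 0 - 1 is -1), and
 * the store into a uint8_t then clamps the result to 255.  The aligned middle
 * of each scanline is delegated to sse2_combine_add_u(), which consumes the
 * bytes four at a time as uint32_t words (hence the "w >> 2" unit count).
 */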
1.4464 + 1.4465 + PIXMAN_IMAGE_GET_LINE ( 1.4466 + src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); 1.4467 + PIXMAN_IMAGE_GET_LINE ( 1.4468 + dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 1.4469 + 1.4470 + while (height--) 1.4471 + { 1.4472 + dst = dst_line; 1.4473 + src = src_line; 1.4474 + 1.4475 + dst_line += dst_stride; 1.4476 + src_line += src_stride; 1.4477 + w = width; 1.4478 + 1.4479 + /* Small head */ 1.4480 + while (w && (uintptr_t)dst & 3) 1.4481 + { 1.4482 + t = (*dst) + (*src++); 1.4483 + *dst++ = t | (0 - (t >> 8)); 1.4484 + w--; 1.4485 + } 1.4486 + 1.4487 + sse2_combine_add_u (imp, op, 1.4488 + (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); 1.4489 + 1.4490 + /* Small tail */ 1.4491 + dst += w & 0xfffc; 1.4492 + src += w & 0xfffc; 1.4493 + 1.4494 + w &= 3; 1.4495 + 1.4496 + while (w) 1.4497 + { 1.4498 + t = (*dst) + (*src++); 1.4499 + *dst++ = t | (0 - (t >> 8)); 1.4500 + w--; 1.4501 + } 1.4502 + } 1.4503 + 1.4504 +} 1.4505 + 1.4506 +static void 1.4507 +sse2_composite_add_8888_8888 (pixman_implementation_t *imp, 1.4508 + pixman_composite_info_t *info) 1.4509 +{ 1.4510 + PIXMAN_COMPOSITE_ARGS (info); 1.4511 + uint32_t *dst_line, *dst; 1.4512 + uint32_t *src_line, *src; 1.4513 + int dst_stride, src_stride; 1.4514 + 1.4515 + PIXMAN_IMAGE_GET_LINE ( 1.4516 + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 1.4517 + PIXMAN_IMAGE_GET_LINE ( 1.4518 + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.4519 + 1.4520 + while (height--) 1.4521 + { 1.4522 + dst = dst_line; 1.4523 + dst_line += dst_stride; 1.4524 + src = src_line; 1.4525 + src_line += src_stride; 1.4526 + 1.4527 + sse2_combine_add_u (imp, op, dst, src, NULL, width); 1.4528 + } 1.4529 +} 1.4530 + 1.4531 +static void 1.4532 +sse2_composite_add_n_8888 (pixman_implementation_t *imp, 1.4533 + pixman_composite_info_t *info) 1.4534 +{ 1.4535 + PIXMAN_COMPOSITE_ARGS (info); 1.4536 + uint32_t *dst_line, *dst, src; 1.4537 + int dst_stride; 1.4538 + 1.4539 + __m128i xmm_src; 1.4540 + 1.4541 + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.4542 + 1.4543 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.4544 + if (src == 0) 1.4545 + return; 1.4546 + 1.4547 + if (src == ~0) 1.4548 + { 1.4549 + pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32, 1.4550 + dest_x, dest_y, width, height, ~0); 1.4551 + 1.4552 + return; 1.4553 + } 1.4554 + 1.4555 + xmm_src = _mm_set_epi32 (src, src, src, src); 1.4556 + while (height--) 1.4557 + { 1.4558 + int w = width; 1.4559 + uint32_t d; 1.4560 + 1.4561 + dst = dst_line; 1.4562 + dst_line += dst_stride; 1.4563 + 1.4564 + while (w && (unsigned long)dst & 15) 1.4565 + { 1.4566 + d = *dst; 1.4567 + *dst++ = 1.4568 + _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d))); 1.4569 + w--; 1.4570 + } 1.4571 + 1.4572 + while (w >= 4) 1.4573 + { 1.4574 + save_128_aligned 1.4575 + ((__m128i*)dst, 1.4576 + _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); 1.4577 + 1.4578 + dst += 4; 1.4579 + w -= 4; 1.4580 + } 1.4581 + 1.4582 + while (w--) 1.4583 + { 1.4584 + d = *dst; 1.4585 + *dst++ = 1.4586 + _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src, 1.4587 + _mm_cvtsi32_si128 (d))); 1.4588 + } 1.4589 + } 1.4590 +} 1.4591 + 1.4592 +static void 1.4593 +sse2_composite_add_n_8_8888 (pixman_implementation_t *imp, 1.4594 + pixman_composite_info_t *info) 1.4595 +{ 1.4596 + PIXMAN_COMPOSITE_ARGS (info); 1.4597 + uint32_t *dst_line, *dst; 1.4598 + uint8_t *mask_line, *mask; 
1.4599 + int dst_stride, mask_stride; 1.4600 + int32_t w; 1.4601 + uint32_t src; 1.4602 + 1.4603 + __m128i xmm_src; 1.4604 + 1.4605 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.4606 + if (src == 0) 1.4607 + return; 1.4608 + xmm_src = expand_pixel_32_1x128 (src); 1.4609 + 1.4610 + PIXMAN_IMAGE_GET_LINE ( 1.4611 + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.4612 + PIXMAN_IMAGE_GET_LINE ( 1.4613 + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 1.4614 + 1.4615 + while (height--) 1.4616 + { 1.4617 + dst = dst_line; 1.4618 + dst_line += dst_stride; 1.4619 + mask = mask_line; 1.4620 + mask_line += mask_stride; 1.4621 + w = width; 1.4622 + 1.4623 + while (w && ((unsigned long)dst & 15)) 1.4624 + { 1.4625 + uint8_t m = *mask++; 1.4626 + if (m) 1.4627 + { 1.4628 + *dst = pack_1x128_32 1.4629 + (_mm_adds_epu16 1.4630 + (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)), 1.4631 + unpack_32_1x128 (*dst))); 1.4632 + } 1.4633 + dst++; 1.4634 + w--; 1.4635 + } 1.4636 + 1.4637 + while (w >= 4) 1.4638 + { 1.4639 + uint32_t m = *(uint32_t*)mask; 1.4640 + if (m) 1.4641 + { 1.4642 + __m128i xmm_mask_lo, xmm_mask_hi; 1.4643 + __m128i xmm_dst_lo, xmm_dst_hi; 1.4644 + 1.4645 + __m128i xmm_dst = load_128_aligned ((__m128i*)dst); 1.4646 + __m128i xmm_mask = 1.4647 + _mm_unpacklo_epi8 (unpack_32_1x128(m), 1.4648 + _mm_setzero_si128 ()); 1.4649 + 1.4650 + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 1.4651 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.4652 + 1.4653 + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, 1.4654 + &xmm_mask_lo, &xmm_mask_hi); 1.4655 + 1.4656 + pix_multiply_2x128 (&xmm_src, &xmm_src, 1.4657 + &xmm_mask_lo, &xmm_mask_hi, 1.4658 + &xmm_mask_lo, &xmm_mask_hi); 1.4659 + 1.4660 + xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); 1.4661 + xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); 1.4662 + 1.4663 + save_128_aligned ( 1.4664 + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.4665 + } 1.4666 + 1.4667 + w -= 4; 1.4668 + dst += 4; 1.4669 + mask += 4; 1.4670 + } 1.4671 + 1.4672 + while (w) 1.4673 + { 1.4674 + uint8_t m = *mask++; 1.4675 + if (m) 1.4676 + { 1.4677 + *dst = pack_1x128_32 1.4678 + (_mm_adds_epu16 1.4679 + (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)), 1.4680 + unpack_32_1x128 (*dst))); 1.4681 + } 1.4682 + dst++; 1.4683 + w--; 1.4684 + } 1.4685 + } 1.4686 +} 1.4687 + 1.4688 +static pixman_bool_t 1.4689 +sse2_blt (pixman_implementation_t *imp, 1.4690 + uint32_t * src_bits, 1.4691 + uint32_t * dst_bits, 1.4692 + int src_stride, 1.4693 + int dst_stride, 1.4694 + int src_bpp, 1.4695 + int dst_bpp, 1.4696 + int src_x, 1.4697 + int src_y, 1.4698 + int dest_x, 1.4699 + int dest_y, 1.4700 + int width, 1.4701 + int height) 1.4702 +{ 1.4703 + uint8_t * src_bytes; 1.4704 + uint8_t * dst_bytes; 1.4705 + int byte_width; 1.4706 + 1.4707 + if (src_bpp != dst_bpp) 1.4708 + return FALSE; 1.4709 + 1.4710 + if (src_bpp == 16) 1.4711 + { 1.4712 + src_stride = src_stride * (int) sizeof (uint32_t) / 2; 1.4713 + dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; 1.4714 + src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); 1.4715 + dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); 1.4716 + byte_width = 2 * width; 1.4717 + src_stride *= 2; 1.4718 + dst_stride *= 2; 1.4719 + } 1.4720 + else if (src_bpp == 32) 1.4721 + { 1.4722 + src_stride = src_stride * (int) sizeof (uint32_t) / 4; 1.4723 + dst_stride = 
dst_stride * (int) sizeof (uint32_t) / 4; 1.4724 + src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); 1.4725 + dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); 1.4726 + byte_width = 4 * width; 1.4727 + src_stride *= 4; 1.4728 + dst_stride *= 4; 1.4729 + } 1.4730 + else 1.4731 + { 1.4732 + return FALSE; 1.4733 + } 1.4734 + 1.4735 + while (height--) 1.4736 + { 1.4737 + int w; 1.4738 + uint8_t *s = src_bytes; 1.4739 + uint8_t *d = dst_bytes; 1.4740 + src_bytes += src_stride; 1.4741 + dst_bytes += dst_stride; 1.4742 + w = byte_width; 1.4743 + 1.4744 + while (w >= 2 && ((uintptr_t)d & 3)) 1.4745 + { 1.4746 + *(uint16_t *)d = *(uint16_t *)s; 1.4747 + w -= 2; 1.4748 + s += 2; 1.4749 + d += 2; 1.4750 + } 1.4751 + 1.4752 + while (w >= 4 && ((uintptr_t)d & 15)) 1.4753 + { 1.4754 + *(uint32_t *)d = *(uint32_t *)s; 1.4755 + 1.4756 + w -= 4; 1.4757 + s += 4; 1.4758 + d += 4; 1.4759 + } 1.4760 + 1.4761 + while (w >= 64) 1.4762 + { 1.4763 + __m128i xmm0, xmm1, xmm2, xmm3; 1.4764 + 1.4765 + xmm0 = load_128_unaligned ((__m128i*)(s)); 1.4766 + xmm1 = load_128_unaligned ((__m128i*)(s + 16)); 1.4767 + xmm2 = load_128_unaligned ((__m128i*)(s + 32)); 1.4768 + xmm3 = load_128_unaligned ((__m128i*)(s + 48)); 1.4769 + 1.4770 + save_128_aligned ((__m128i*)(d), xmm0); 1.4771 + save_128_aligned ((__m128i*)(d + 16), xmm1); 1.4772 + save_128_aligned ((__m128i*)(d + 32), xmm2); 1.4773 + save_128_aligned ((__m128i*)(d + 48), xmm3); 1.4774 + 1.4775 + s += 64; 1.4776 + d += 64; 1.4777 + w -= 64; 1.4778 + } 1.4779 + 1.4780 + while (w >= 16) 1.4781 + { 1.4782 + save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) ); 1.4783 + 1.4784 + w -= 16; 1.4785 + d += 16; 1.4786 + s += 16; 1.4787 + } 1.4788 + 1.4789 + while (w >= 4) 1.4790 + { 1.4791 + *(uint32_t *)d = *(uint32_t *)s; 1.4792 + 1.4793 + w -= 4; 1.4794 + s += 4; 1.4795 + d += 4; 1.4796 + } 1.4797 + 1.4798 + if (w >= 2) 1.4799 + { 1.4800 + *(uint16_t *)d = *(uint16_t *)s; 1.4801 + w -= 2; 1.4802 + s += 2; 1.4803 + d += 2; 1.4804 + } 1.4805 + } 1.4806 + 1.4807 + return TRUE; 1.4808 +} 1.4809 + 1.4810 +static void 1.4811 +sse2_composite_copy_area (pixman_implementation_t *imp, 1.4812 + pixman_composite_info_t *info) 1.4813 +{ 1.4814 + PIXMAN_COMPOSITE_ARGS (info); 1.4815 + sse2_blt (imp, src_image->bits.bits, 1.4816 + dest_image->bits.bits, 1.4817 + src_image->bits.rowstride, 1.4818 + dest_image->bits.rowstride, 1.4819 + PIXMAN_FORMAT_BPP (src_image->bits.format), 1.4820 + PIXMAN_FORMAT_BPP (dest_image->bits.format), 1.4821 + src_x, src_y, dest_x, dest_y, width, height); 1.4822 +} 1.4823 + 1.4824 +static void 1.4825 +sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, 1.4826 + pixman_composite_info_t *info) 1.4827 +{ 1.4828 + PIXMAN_COMPOSITE_ARGS (info); 1.4829 + uint32_t *src, *src_line, s; 1.4830 + uint32_t *dst, *dst_line, d; 1.4831 + uint8_t *mask, *mask_line; 1.4832 + uint32_t m; 1.4833 + int src_stride, mask_stride, dst_stride; 1.4834 + int32_t w; 1.4835 + __m128i ms; 1.4836 + 1.4837 + __m128i xmm_src, xmm_src_lo, xmm_src_hi; 1.4838 + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1.4839 + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 1.4840 + 1.4841 + PIXMAN_IMAGE_GET_LINE ( 1.4842 + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.4843 + PIXMAN_IMAGE_GET_LINE ( 1.4844 + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 1.4845 + PIXMAN_IMAGE_GET_LINE ( 1.4846 + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 1.4847 + 1.4848 + while (height--) 
1.4849 + { 1.4850 + src = src_line; 1.4851 + src_line += src_stride; 1.4852 + dst = dst_line; 1.4853 + dst_line += dst_stride; 1.4854 + mask = mask_line; 1.4855 + mask_line += mask_stride; 1.4856 + 1.4857 + w = width; 1.4858 + 1.4859 + while (w && (uintptr_t)dst & 15) 1.4860 + { 1.4861 + s = 0xff000000 | *src++; 1.4862 + m = (uint32_t) *mask++; 1.4863 + d = *dst; 1.4864 + ms = unpack_32_1x128 (s); 1.4865 + 1.4866 + if (m != 0xff) 1.4867 + { 1.4868 + __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); 1.4869 + __m128i md = unpack_32_1x128 (d); 1.4870 + 1.4871 + ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md); 1.4872 + } 1.4873 + 1.4874 + *dst++ = pack_1x128_32 (ms); 1.4875 + w--; 1.4876 + } 1.4877 + 1.4878 + while (w >= 4) 1.4879 + { 1.4880 + m = *(uint32_t*) mask; 1.4881 + xmm_src = _mm_or_si128 ( 1.4882 + load_128_unaligned ((__m128i*)src), mask_ff000000); 1.4883 + 1.4884 + if (m == 0xffffffff) 1.4885 + { 1.4886 + save_128_aligned ((__m128i*)dst, xmm_src); 1.4887 + } 1.4888 + else 1.4889 + { 1.4890 + xmm_dst = load_128_aligned ((__m128i*)dst); 1.4891 + 1.4892 + xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); 1.4893 + 1.4894 + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 1.4895 + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 1.4896 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.4897 + 1.4898 + expand_alpha_rev_2x128 ( 1.4899 + xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1.4900 + 1.4901 + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, 1.4902 + &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, 1.4903 + &xmm_dst_lo, &xmm_dst_hi); 1.4904 + 1.4905 + save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.4906 + } 1.4907 + 1.4908 + src += 4; 1.4909 + dst += 4; 1.4910 + mask += 4; 1.4911 + w -= 4; 1.4912 + } 1.4913 + 1.4914 + while (w) 1.4915 + { 1.4916 + m = (uint32_t) *mask++; 1.4917 + 1.4918 + if (m) 1.4919 + { 1.4920 + s = 0xff000000 | *src; 1.4921 + 1.4922 + if (m == 0xff) 1.4923 + { 1.4924 + *dst = s; 1.4925 + } 1.4926 + else 1.4927 + { 1.4928 + __m128i ma, md, ms; 1.4929 + 1.4930 + d = *dst; 1.4931 + 1.4932 + ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); 1.4933 + md = unpack_32_1x128 (d); 1.4934 + ms = unpack_32_1x128 (s); 1.4935 + 1.4936 + *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md)); 1.4937 + } 1.4938 + 1.4939 + } 1.4940 + 1.4941 + src++; 1.4942 + dst++; 1.4943 + w--; 1.4944 + } 1.4945 + } 1.4946 + 1.4947 +} 1.4948 + 1.4949 +static void 1.4950 +sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, 1.4951 + pixman_composite_info_t *info) 1.4952 +{ 1.4953 + PIXMAN_COMPOSITE_ARGS (info); 1.4954 + uint32_t *src, *src_line, s; 1.4955 + uint32_t *dst, *dst_line, d; 1.4956 + uint8_t *mask, *mask_line; 1.4957 + uint32_t m; 1.4958 + int src_stride, mask_stride, dst_stride; 1.4959 + int32_t w; 1.4960 + 1.4961 + __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; 1.4962 + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1.4963 + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 1.4964 + 1.4965 + PIXMAN_IMAGE_GET_LINE ( 1.4966 + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.4967 + PIXMAN_IMAGE_GET_LINE ( 1.4968 + mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 1.4969 + PIXMAN_IMAGE_GET_LINE ( 1.4970 + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 1.4971 + 1.4972 + while (height--) 1.4973 + { 1.4974 + src = src_line; 1.4975 + src_line += src_stride; 1.4976 + dst = dst_line; 1.4977 + dst_line += dst_stride; 1.4978 + mask = 
mask_line; 1.4979 + mask_line += mask_stride; 1.4980 + 1.4981 + w = width; 1.4982 + 1.4983 + while (w && (uintptr_t)dst & 15) 1.4984 + { 1.4985 + uint32_t sa; 1.4986 + 1.4987 + s = *src++; 1.4988 + m = (uint32_t) *mask++; 1.4989 + d = *dst; 1.4990 + 1.4991 + sa = s >> 24; 1.4992 + 1.4993 + if (m) 1.4994 + { 1.4995 + if (sa == 0xff && m == 0xff) 1.4996 + { 1.4997 + *dst = s; 1.4998 + } 1.4999 + else 1.5000 + { 1.5001 + __m128i ms, md, ma, msa; 1.5002 + 1.5003 + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); 1.5004 + ms = unpack_32_1x128 (s); 1.5005 + md = unpack_32_1x128 (d); 1.5006 + 1.5007 + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); 1.5008 + 1.5009 + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); 1.5010 + } 1.5011 + } 1.5012 + 1.5013 + dst++; 1.5014 + w--; 1.5015 + } 1.5016 + 1.5017 + while (w >= 4) 1.5018 + { 1.5019 + m = *(uint32_t *) mask; 1.5020 + 1.5021 + if (m) 1.5022 + { 1.5023 + xmm_src = load_128_unaligned ((__m128i*)src); 1.5024 + 1.5025 + if (m == 0xffffffff && is_opaque (xmm_src)) 1.5026 + { 1.5027 + save_128_aligned ((__m128i *)dst, xmm_src); 1.5028 + } 1.5029 + else 1.5030 + { 1.5031 + xmm_dst = load_128_aligned ((__m128i *)dst); 1.5032 + 1.5033 + xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); 1.5034 + 1.5035 + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 1.5036 + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 1.5037 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.5038 + 1.5039 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); 1.5040 + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1.5041 + 1.5042 + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, 1.5043 + &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); 1.5044 + 1.5045 + save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.5046 + } 1.5047 + } 1.5048 + 1.5049 + src += 4; 1.5050 + dst += 4; 1.5051 + mask += 4; 1.5052 + w -= 4; 1.5053 + } 1.5054 + 1.5055 + while (w) 1.5056 + { 1.5057 + uint32_t sa; 1.5058 + 1.5059 + s = *src++; 1.5060 + m = (uint32_t) *mask++; 1.5061 + d = *dst; 1.5062 + 1.5063 + sa = s >> 24; 1.5064 + 1.5065 + if (m) 1.5066 + { 1.5067 + if (sa == 0xff && m == 0xff) 1.5068 + { 1.5069 + *dst = s; 1.5070 + } 1.5071 + else 1.5072 + { 1.5073 + __m128i ms, md, ma, msa; 1.5074 + 1.5075 + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); 1.5076 + ms = unpack_32_1x128 (s); 1.5077 + md = unpack_32_1x128 (d); 1.5078 + 1.5079 + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); 1.5080 + 1.5081 + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); 1.5082 + } 1.5083 + } 1.5084 + 1.5085 + dst++; 1.5086 + w--; 1.5087 + } 1.5088 + } 1.5089 + 1.5090 +} 1.5091 + 1.5092 +static void 1.5093 +sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp, 1.5094 + pixman_composite_info_t *info) 1.5095 +{ 1.5096 + PIXMAN_COMPOSITE_ARGS (info); 1.5097 + uint32_t src; 1.5098 + uint32_t *dst_line, *dst; 1.5099 + __m128i xmm_src; 1.5100 + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1.5101 + __m128i xmm_dsta_hi, xmm_dsta_lo; 1.5102 + int dst_stride; 1.5103 + int32_t w; 1.5104 + 1.5105 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.5106 + 1.5107 + if (src == 0) 1.5108 + return; 1.5109 + 1.5110 + PIXMAN_IMAGE_GET_LINE ( 1.5111 + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.5112 + 1.5113 + xmm_src = expand_pixel_32_1x128 (src); 1.5114 + 1.5115 + while (height--) 1.5116 + { 1.5117 + dst = dst_line; 
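/*
 * OVER_REVERSE with a solid source keeps the destination on top: each store
 * in the loops below effectively computes dst + (1 - dst.alpha) * src, which
 * is why the unpacked destination pixels and their expanded alpha are handed
 * to over_1x128()/over_2x128() as the upper layer while the pre-expanded
 * solid color (xmm_src) is supplied as the lower one.
 */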
1.5118 + 1.5119 + dst_line += dst_stride; 1.5120 + w = width; 1.5121 + 1.5122 + while (w && (uintptr_t)dst & 15) 1.5123 + { 1.5124 + __m128i vd; 1.5125 + 1.5126 + vd = unpack_32_1x128 (*dst); 1.5127 + 1.5128 + *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), 1.5129 + xmm_src)); 1.5130 + w--; 1.5131 + dst++; 1.5132 + } 1.5133 + 1.5134 + while (w >= 4) 1.5135 + { 1.5136 + __m128i tmp_lo, tmp_hi; 1.5137 + 1.5138 + xmm_dst = load_128_aligned ((__m128i*)dst); 1.5139 + 1.5140 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.5141 + expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi); 1.5142 + 1.5143 + tmp_lo = xmm_src; 1.5144 + tmp_hi = xmm_src; 1.5145 + 1.5146 + over_2x128 (&xmm_dst_lo, &xmm_dst_hi, 1.5147 + &xmm_dsta_lo, &xmm_dsta_hi, 1.5148 + &tmp_lo, &tmp_hi); 1.5149 + 1.5150 + save_128_aligned ( 1.5151 + (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi)); 1.5152 + 1.5153 + w -= 4; 1.5154 + dst += 4; 1.5155 + } 1.5156 + 1.5157 + while (w) 1.5158 + { 1.5159 + __m128i vd; 1.5160 + 1.5161 + vd = unpack_32_1x128 (*dst); 1.5162 + 1.5163 + *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), 1.5164 + xmm_src)); 1.5165 + w--; 1.5166 + dst++; 1.5167 + } 1.5168 + 1.5169 + } 1.5170 + 1.5171 +} 1.5172 + 1.5173 +static void 1.5174 +sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, 1.5175 + pixman_composite_info_t *info) 1.5176 +{ 1.5177 + PIXMAN_COMPOSITE_ARGS (info); 1.5178 + uint32_t *src, *src_line, s; 1.5179 + uint32_t *dst, *dst_line, d; 1.5180 + uint32_t *mask, *mask_line; 1.5181 + uint32_t m; 1.5182 + int src_stride, mask_stride, dst_stride; 1.5183 + int32_t w; 1.5184 + 1.5185 + __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; 1.5186 + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1.5187 + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 1.5188 + 1.5189 + PIXMAN_IMAGE_GET_LINE ( 1.5190 + dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.5191 + PIXMAN_IMAGE_GET_LINE ( 1.5192 + mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); 1.5193 + PIXMAN_IMAGE_GET_LINE ( 1.5194 + src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 1.5195 + 1.5196 + while (height--) 1.5197 + { 1.5198 + src = src_line; 1.5199 + src_line += src_stride; 1.5200 + dst = dst_line; 1.5201 + dst_line += dst_stride; 1.5202 + mask = mask_line; 1.5203 + mask_line += mask_stride; 1.5204 + 1.5205 + w = width; 1.5206 + 1.5207 + while (w && (uintptr_t)dst & 15) 1.5208 + { 1.5209 + uint32_t sa; 1.5210 + 1.5211 + s = *src++; 1.5212 + m = (*mask++) >> 24; 1.5213 + d = *dst; 1.5214 + 1.5215 + sa = s >> 24; 1.5216 + 1.5217 + if (m) 1.5218 + { 1.5219 + if (sa == 0xff && m == 0xff) 1.5220 + { 1.5221 + *dst = s; 1.5222 + } 1.5223 + else 1.5224 + { 1.5225 + __m128i ms, md, ma, msa; 1.5226 + 1.5227 + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); 1.5228 + ms = unpack_32_1x128 (s); 1.5229 + md = unpack_32_1x128 (d); 1.5230 + 1.5231 + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); 1.5232 + 1.5233 + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); 1.5234 + } 1.5235 + } 1.5236 + 1.5237 + dst++; 1.5238 + w--; 1.5239 + } 1.5240 + 1.5241 + while (w >= 4) 1.5242 + { 1.5243 + xmm_mask = load_128_unaligned ((__m128i*)mask); 1.5244 + 1.5245 + if (!is_transparent (xmm_mask)) 1.5246 + { 1.5247 + xmm_src = load_128_unaligned ((__m128i*)src); 1.5248 + 1.5249 + if (is_opaque (xmm_mask) && is_opaque (xmm_src)) 1.5250 + { 1.5251 + save_128_aligned ((__m128i *)dst, xmm_src); 1.5252 + } 1.5253 + else 1.5254 + { 1.5255 + xmm_dst = 
load_128_aligned ((__m128i *)dst); 1.5256 + 1.5257 + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 1.5258 + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 1.5259 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.5260 + 1.5261 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); 1.5262 + expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1.5263 + 1.5264 + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, 1.5265 + &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); 1.5266 + 1.5267 + save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.5268 + } 1.5269 + } 1.5270 + 1.5271 + src += 4; 1.5272 + dst += 4; 1.5273 + mask += 4; 1.5274 + w -= 4; 1.5275 + } 1.5276 + 1.5277 + while (w) 1.5278 + { 1.5279 + uint32_t sa; 1.5280 + 1.5281 + s = *src++; 1.5282 + m = (*mask++) >> 24; 1.5283 + d = *dst; 1.5284 + 1.5285 + sa = s >> 24; 1.5286 + 1.5287 + if (m) 1.5288 + { 1.5289 + if (sa == 0xff && m == 0xff) 1.5290 + { 1.5291 + *dst = s; 1.5292 + } 1.5293 + else 1.5294 + { 1.5295 + __m128i ms, md, ma, msa; 1.5296 + 1.5297 + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); 1.5298 + ms = unpack_32_1x128 (s); 1.5299 + md = unpack_32_1x128 (d); 1.5300 + 1.5301 + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); 1.5302 + 1.5303 + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); 1.5304 + } 1.5305 + } 1.5306 + 1.5307 + dst++; 1.5308 + w--; 1.5309 + } 1.5310 + } 1.5311 + 1.5312 +} 1.5313 + 1.5314 +/* A variant of 'sse2_combine_over_u' with minor tweaks */ 1.5315 +static force_inline void 1.5316 +scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, 1.5317 + const uint32_t* ps, 1.5318 + int32_t w, 1.5319 + pixman_fixed_t vx, 1.5320 + pixman_fixed_t unit_x, 1.5321 + pixman_fixed_t src_width_fixed, 1.5322 + pixman_bool_t fully_transparent_src) 1.5323 +{ 1.5324 + uint32_t s, d; 1.5325 + const uint32_t* pm = NULL; 1.5326 + 1.5327 + __m128i xmm_dst_lo, xmm_dst_hi; 1.5328 + __m128i xmm_src_lo, xmm_src_hi; 1.5329 + __m128i xmm_alpha_lo, xmm_alpha_hi; 1.5330 + 1.5331 + if (fully_transparent_src) 1.5332 + return; 1.5333 + 1.5334 + /* Align dst on a 16-byte boundary */ 1.5335 + while (w && ((uintptr_t)pd & 15)) 1.5336 + { 1.5337 + d = *pd; 1.5338 + s = combine1 (ps + pixman_fixed_to_int (vx), pm); 1.5339 + vx += unit_x; 1.5340 + while (vx >= 0) 1.5341 + vx -= src_width_fixed; 1.5342 + 1.5343 + *pd++ = core_combine_over_u_pixel_sse2 (s, d); 1.5344 + if (pm) 1.5345 + pm++; 1.5346 + w--; 1.5347 + } 1.5348 + 1.5349 + while (w >= 4) 1.5350 + { 1.5351 + __m128i tmp; 1.5352 + uint32_t tmp1, tmp2, tmp3, tmp4; 1.5353 + 1.5354 + tmp1 = *(ps + pixman_fixed_to_int (vx)); 1.5355 + vx += unit_x; 1.5356 + while (vx >= 0) 1.5357 + vx -= src_width_fixed; 1.5358 + tmp2 = *(ps + pixman_fixed_to_int (vx)); 1.5359 + vx += unit_x; 1.5360 + while (vx >= 0) 1.5361 + vx -= src_width_fixed; 1.5362 + tmp3 = *(ps + pixman_fixed_to_int (vx)); 1.5363 + vx += unit_x; 1.5364 + while (vx >= 0) 1.5365 + vx -= src_width_fixed; 1.5366 + tmp4 = *(ps + pixman_fixed_to_int (vx)); 1.5367 + vx += unit_x; 1.5368 + while (vx >= 0) 1.5369 + vx -= src_width_fixed; 1.5370 + 1.5371 + tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); 1.5372 + 1.5373 + xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm); 1.5374 + 1.5375 + if (is_opaque (xmm_src_hi)) 1.5376 + { 1.5377 + save_128_aligned ((__m128i*)pd, xmm_src_hi); 1.5378 + } 1.5379 + else if (!is_zero (xmm_src_hi)) 1.5380 + { 1.5381 + xmm_dst_hi = load_128_aligned ((__m128i*) pd); 1.5382 + 1.5383 + 
unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); 1.5384 + unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); 1.5385 + 1.5386 + expand_alpha_2x128 ( 1.5387 + xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); 1.5388 + 1.5389 + over_2x128 (&xmm_src_lo, &xmm_src_hi, 1.5390 + &xmm_alpha_lo, &xmm_alpha_hi, 1.5391 + &xmm_dst_lo, &xmm_dst_hi); 1.5392 + 1.5393 + /* rebuid the 4 pixel data and save*/ 1.5394 + save_128_aligned ((__m128i*)pd, 1.5395 + pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.5396 + } 1.5397 + 1.5398 + w -= 4; 1.5399 + pd += 4; 1.5400 + if (pm) 1.5401 + pm += 4; 1.5402 + } 1.5403 + 1.5404 + while (w) 1.5405 + { 1.5406 + d = *pd; 1.5407 + s = combine1 (ps + pixman_fixed_to_int (vx), pm); 1.5408 + vx += unit_x; 1.5409 + while (vx >= 0) 1.5410 + vx -= src_width_fixed; 1.5411 + 1.5412 + *pd++ = core_combine_over_u_pixel_sse2 (s, d); 1.5413 + if (pm) 1.5414 + pm++; 1.5415 + 1.5416 + w--; 1.5417 + } 1.5418 +} 1.5419 + 1.5420 +FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER, 1.5421 + scaled_nearest_scanline_sse2_8888_8888_OVER, 1.5422 + uint32_t, uint32_t, COVER) 1.5423 +FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER, 1.5424 + scaled_nearest_scanline_sse2_8888_8888_OVER, 1.5425 + uint32_t, uint32_t, NONE) 1.5426 +FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER, 1.5427 + scaled_nearest_scanline_sse2_8888_8888_OVER, 1.5428 + uint32_t, uint32_t, PAD) 1.5429 +FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER, 1.5430 + scaled_nearest_scanline_sse2_8888_8888_OVER, 1.5431 + uint32_t, uint32_t, NORMAL) 1.5432 + 1.5433 +static force_inline void 1.5434 +scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, 1.5435 + uint32_t * dst, 1.5436 + const uint32_t * src, 1.5437 + int32_t w, 1.5438 + pixman_fixed_t vx, 1.5439 + pixman_fixed_t unit_x, 1.5440 + pixman_fixed_t src_width_fixed, 1.5441 + pixman_bool_t zero_src) 1.5442 +{ 1.5443 + __m128i xmm_mask; 1.5444 + __m128i xmm_src, xmm_src_lo, xmm_src_hi; 1.5445 + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1.5446 + __m128i xmm_alpha_lo, xmm_alpha_hi; 1.5447 + 1.5448 + if (zero_src || (*mask >> 24) == 0) 1.5449 + return; 1.5450 + 1.5451 + xmm_mask = create_mask_16_128 (*mask >> 24); 1.5452 + 1.5453 + while (w && (uintptr_t)dst & 15) 1.5454 + { 1.5455 + uint32_t s = *(src + pixman_fixed_to_int (vx)); 1.5456 + vx += unit_x; 1.5457 + while (vx >= 0) 1.5458 + vx -= src_width_fixed; 1.5459 + 1.5460 + if (s) 1.5461 + { 1.5462 + uint32_t d = *dst; 1.5463 + 1.5464 + __m128i ms = unpack_32_1x128 (s); 1.5465 + __m128i alpha = expand_alpha_1x128 (ms); 1.5466 + __m128i dest = xmm_mask; 1.5467 + __m128i alpha_dst = unpack_32_1x128 (d); 1.5468 + 1.5469 + *dst = pack_1x128_32 ( 1.5470 + in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); 1.5471 + } 1.5472 + dst++; 1.5473 + w--; 1.5474 + } 1.5475 + 1.5476 + while (w >= 4) 1.5477 + { 1.5478 + uint32_t tmp1, tmp2, tmp3, tmp4; 1.5479 + 1.5480 + tmp1 = *(src + pixman_fixed_to_int (vx)); 1.5481 + vx += unit_x; 1.5482 + while (vx >= 0) 1.5483 + vx -= src_width_fixed; 1.5484 + tmp2 = *(src + pixman_fixed_to_int (vx)); 1.5485 + vx += unit_x; 1.5486 + while (vx >= 0) 1.5487 + vx -= src_width_fixed; 1.5488 + tmp3 = *(src + pixman_fixed_to_int (vx)); 1.5489 + vx += unit_x; 1.5490 + while (vx >= 0) 1.5491 + vx -= src_width_fixed; 1.5492 + tmp4 = *(src + pixman_fixed_to_int (vx)); 1.5493 + vx += unit_x; 1.5494 + while (vx >= 0) 1.5495 + vx -= src_width_fixed; 1.5496 + 1.5497 + xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); 1.5498 + 1.5499 + if (!is_zero (xmm_src)) 1.5500 + { 1.5501 + xmm_dst = 
load_128_aligned ((__m128i*)dst); 1.5502 + 1.5503 + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 1.5504 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.5505 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1.5506 + &xmm_alpha_lo, &xmm_alpha_hi); 1.5507 + 1.5508 + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, 1.5509 + &xmm_alpha_lo, &xmm_alpha_hi, 1.5510 + &xmm_mask, &xmm_mask, 1.5511 + &xmm_dst_lo, &xmm_dst_hi); 1.5512 + 1.5513 + save_128_aligned ( 1.5514 + (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.5515 + } 1.5516 + 1.5517 + dst += 4; 1.5518 + w -= 4; 1.5519 + } 1.5520 + 1.5521 + while (w) 1.5522 + { 1.5523 + uint32_t s = *(src + pixman_fixed_to_int (vx)); 1.5524 + vx += unit_x; 1.5525 + while (vx >= 0) 1.5526 + vx -= src_width_fixed; 1.5527 + 1.5528 + if (s) 1.5529 + { 1.5530 + uint32_t d = *dst; 1.5531 + 1.5532 + __m128i ms = unpack_32_1x128 (s); 1.5533 + __m128i alpha = expand_alpha_1x128 (ms); 1.5534 + __m128i mask = xmm_mask; 1.5535 + __m128i dest = unpack_32_1x128 (d); 1.5536 + 1.5537 + *dst = pack_1x128_32 ( 1.5538 + in_over_1x128 (&ms, &alpha, &mask, &dest)); 1.5539 + } 1.5540 + 1.5541 + dst++; 1.5542 + w--; 1.5543 + } 1.5544 + 1.5545 +} 1.5546 + 1.5547 +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, 1.5548 + scaled_nearest_scanline_sse2_8888_n_8888_OVER, 1.5549 + uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE) 1.5550 +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, 1.5551 + scaled_nearest_scanline_sse2_8888_n_8888_OVER, 1.5552 + uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE) 1.5553 +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, 1.5554 + scaled_nearest_scanline_sse2_8888_n_8888_OVER, 1.5555 + uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) 1.5556 +FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, 1.5557 + scaled_nearest_scanline_sse2_8888_n_8888_OVER, 1.5558 + uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE) 1.5559 + 1.5560 +#define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1) 1.5561 + 1.5562 +#define BILINEAR_DECLARE_VARIABLES \ 1.5563 + const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \ 1.5564 + const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \ 1.5565 + const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\ 1.5566 + const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \ 1.5567 + const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\ 1.5568 + const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \ 1.5569 + const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x, \ 1.5570 + unit_x, unit_x, unit_x, unit_x); \ 1.5571 + const __m128i xmm_zero = _mm_setzero_si128 (); \ 1.5572 + __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx) 1.5573 + 1.5574 +#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \ 1.5575 +do { \ 1.5576 + __m128i xmm_wh, xmm_lo, xmm_hi, a; \ 1.5577 + /* fetch 2x2 pixel block into sse2 registers */ \ 1.5578 + __m128i tltr = _mm_loadl_epi64 ( \ 1.5579 + (__m128i *)&src_top[pixman_fixed_to_int (vx)]); \ 1.5580 + __m128i blbr = _mm_loadl_epi64 ( \ 1.5581 + (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]); \ 1.5582 + vx += unit_x; \ 1.5583 + /* vertical interpolation */ \ 1.5584 + a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), \ 1.5585 + xmm_wt), \ 1.5586 + _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), \ 1.5587 + xmm_wb)); \ 1.5588 + if (BILINEAR_INTERPOLATION_BITS < 8) \ 1.5589 + { \ 1.5590 + /* calculate horizontal weights */ \ 1.5591 
+ xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7, \ 1.5592 + _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \ 1.5593 + xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ 1.5594 + /* horizontal interpolation */ \ 1.5595 + a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \ 1.5596 + a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh); \ 1.5597 + } \ 1.5598 + else \ 1.5599 + { \ 1.5600 + /* calculate horizontal weights */ \ 1.5601 + xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8, \ 1.5602 + _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \ 1.5603 + xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ 1.5604 + /* horizontal interpolation */ \ 1.5605 + xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \ 1.5606 + xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \ 1.5607 + a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \ 1.5608 + _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \ 1.5609 + } \ 1.5610 + /* shift and pack the result */ \ 1.5611 + a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2); \ 1.5612 + a = _mm_packs_epi32 (a, a); \ 1.5613 + a = _mm_packus_epi16 (a, a); \ 1.5614 + pix = _mm_cvtsi128_si32 (a); \ 1.5615 +} while (0) 1.5616 + 1.5617 +#define BILINEAR_SKIP_ONE_PIXEL() \ 1.5618 +do { \ 1.5619 + vx += unit_x; \ 1.5620 + xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ 1.5621 +} while(0) 1.5622 + 1.5623 +static force_inline void 1.5624 +scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst, 1.5625 + const uint32_t * mask, 1.5626 + const uint32_t * src_top, 1.5627 + const uint32_t * src_bottom, 1.5628 + int32_t w, 1.5629 + int wt, 1.5630 + int wb, 1.5631 + pixman_fixed_t vx, 1.5632 + pixman_fixed_t unit_x, 1.5633 + pixman_fixed_t max_vx, 1.5634 + pixman_bool_t zero_src) 1.5635 +{ 1.5636 + BILINEAR_DECLARE_VARIABLES; 1.5637 + uint32_t pix1, pix2, pix3, pix4; 1.5638 + 1.5639 + while ((w -= 4) >= 0) 1.5640 + { 1.5641 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 1.5642 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); 1.5643 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); 1.5644 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); 1.5645 + *dst++ = pix1; 1.5646 + *dst++ = pix2; 1.5647 + *dst++ = pix3; 1.5648 + *dst++ = pix4; 1.5649 + } 1.5650 + 1.5651 + if (w & 2) 1.5652 + { 1.5653 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 1.5654 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); 1.5655 + *dst++ = pix1; 1.5656 + *dst++ = pix2; 1.5657 + } 1.5658 + 1.5659 + if (w & 1) 1.5660 + { 1.5661 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 1.5662 + *dst = pix1; 1.5663 + } 1.5664 + 1.5665 +} 1.5666 + 1.5667 +/* Add extra NULL argument to the existing bilinear fast paths to indicate 1.5668 + * that we don't need two-pass processing */ 1.5669 + 1.5670 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC, 1.5671 + scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, 1.5672 + uint32_t, uint32_t, uint32_t, 1.5673 + COVER, FLAG_NONE) 1.5674 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC, 1.5675 + scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, 1.5676 + uint32_t, uint32_t, uint32_t, 1.5677 + PAD, FLAG_NONE) 1.5678 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC, 1.5679 + scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, 1.5680 + uint32_t, uint32_t, uint32_t, 1.5681 + NONE, FLAG_NONE) 1.5682 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC, 1.5683 + scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, 1.5684 + uint32_t, uint32_t, uint32_t, 1.5685 + NORMAL, FLAG_NONE) 1.5686 + 1.5687 +static force_inline void 1.5688 +scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst, 1.5689 + const uint32_t * mask, 1.5690 + 
const uint32_t * src_top, 1.5691 + const uint32_t * src_bottom, 1.5692 + int32_t w, 1.5693 + int wt, 1.5694 + int wb, 1.5695 + pixman_fixed_t vx, 1.5696 + pixman_fixed_t unit_x, 1.5697 + pixman_fixed_t max_vx, 1.5698 + pixman_bool_t zero_src) 1.5699 +{ 1.5700 + BILINEAR_DECLARE_VARIABLES; 1.5701 + uint32_t pix1, pix2, pix3, pix4; 1.5702 + 1.5703 + while (w && ((uintptr_t)dst & 15)) 1.5704 + { 1.5705 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 1.5706 + 1.5707 + if (pix1) 1.5708 + { 1.5709 + pix2 = *dst; 1.5710 + *dst = core_combine_over_u_pixel_sse2 (pix1, pix2); 1.5711 + } 1.5712 + 1.5713 + w--; 1.5714 + dst++; 1.5715 + } 1.5716 + 1.5717 + while (w >= 4) 1.5718 + { 1.5719 + __m128i xmm_src; 1.5720 + __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo; 1.5721 + __m128i xmm_alpha_hi, xmm_alpha_lo; 1.5722 + 1.5723 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 1.5724 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); 1.5725 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); 1.5726 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); 1.5727 + 1.5728 + xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); 1.5729 + 1.5730 + if (!is_zero (xmm_src)) 1.5731 + { 1.5732 + if (is_opaque (xmm_src)) 1.5733 + { 1.5734 + save_128_aligned ((__m128i *)dst, xmm_src); 1.5735 + } 1.5736 + else 1.5737 + { 1.5738 + __m128i xmm_dst = load_128_aligned ((__m128i *)dst); 1.5739 + 1.5740 + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 1.5741 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.5742 + 1.5743 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); 1.5744 + over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, 1.5745 + &xmm_dst_lo, &xmm_dst_hi); 1.5746 + 1.5747 + save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.5748 + } 1.5749 + } 1.5750 + 1.5751 + w -= 4; 1.5752 + dst += 4; 1.5753 + } 1.5754 + 1.5755 + while (w) 1.5756 + { 1.5757 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 1.5758 + 1.5759 + if (pix1) 1.5760 + { 1.5761 + pix2 = *dst; 1.5762 + *dst = core_combine_over_u_pixel_sse2 (pix1, pix2); 1.5763 + } 1.5764 + 1.5765 + w--; 1.5766 + dst++; 1.5767 + } 1.5768 +} 1.5769 + 1.5770 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER, 1.5771 + scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, 1.5772 + uint32_t, uint32_t, uint32_t, 1.5773 + COVER, FLAG_NONE) 1.5774 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER, 1.5775 + scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, 1.5776 + uint32_t, uint32_t, uint32_t, 1.5777 + PAD, FLAG_NONE) 1.5778 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER, 1.5779 + scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, 1.5780 + uint32_t, uint32_t, uint32_t, 1.5781 + NONE, FLAG_NONE) 1.5782 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER, 1.5783 + scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, 1.5784 + uint32_t, uint32_t, uint32_t, 1.5785 + NORMAL, FLAG_NONE) 1.5786 + 1.5787 + 1.5788 +/* An example of SSE2 two-stage bilinear_over_8888_0565 fast path, which is implemented 1.5789 + as scaled_bilinear_scanline_sse2_8888_8888_SRC + op_bilinear_over_8888_0565 */ 1.5790 + 1.5791 +void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width) 1.5792 +{ 1.5793 + /* Note: this is not really fast and should be based on 8 pixel loop from sse2_composite_over_8888_0565 */ 1.5794 + while (--width >= 0) 1.5795 + { 1.5796 + *dst = composite_over_8888_0565pixel (*src, *dst); 1.5797 + src++; 1.5798 + dst++; 1.5799 + } 1.5800 +} 1.5801 + 1.5802 +FAST_BILINEAR_MAINLOOP_COMMON 
(sse2_8888_0565_cover_OVER, 1.5803 + scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, 1.5804 + uint32_t, uint32_t, uint16_t, 1.5805 + COVER, FLAG_NONE) 1.5806 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_pad_OVER, 1.5807 + scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, 1.5808 + uint32_t, uint32_t, uint16_t, 1.5809 + PAD, FLAG_NONE) 1.5810 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_none_OVER, 1.5811 + scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, 1.5812 + uint32_t, uint32_t, uint16_t, 1.5813 + NONE, FLAG_NONE) 1.5814 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_normal_OVER, 1.5815 + scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, 1.5816 + uint32_t, uint32_t, uint16_t, 1.5817 + NORMAL, FLAG_NONE) 1.5818 + 1.5819 +/*****************************/ 1.5820 + 1.5821 +static force_inline void 1.5822 +scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst, 1.5823 + const uint8_t * mask, 1.5824 + const uint32_t * src_top, 1.5825 + const uint32_t * src_bottom, 1.5826 + int32_t w, 1.5827 + int wt, 1.5828 + int wb, 1.5829 + pixman_fixed_t vx, 1.5830 + pixman_fixed_t unit_x, 1.5831 + pixman_fixed_t max_vx, 1.5832 + pixman_bool_t zero_src) 1.5833 +{ 1.5834 + BILINEAR_DECLARE_VARIABLES; 1.5835 + uint32_t pix1, pix2, pix3, pix4; 1.5836 + uint32_t m; 1.5837 + 1.5838 + while (w && ((uintptr_t)dst & 15)) 1.5839 + { 1.5840 + uint32_t sa; 1.5841 + 1.5842 + m = (uint32_t) *mask++; 1.5843 + 1.5844 + if (m) 1.5845 + { 1.5846 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 1.5847 + sa = pix1 >> 24; 1.5848 + 1.5849 + if (sa == 0xff && m == 0xff) 1.5850 + { 1.5851 + *dst = pix1; 1.5852 + } 1.5853 + else 1.5854 + { 1.5855 + __m128i ms, md, ma, msa; 1.5856 + 1.5857 + pix2 = *dst; 1.5858 + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); 1.5859 + ms = unpack_32_1x128 (pix1); 1.5860 + md = unpack_32_1x128 (pix2); 1.5861 + 1.5862 + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); 1.5863 + 1.5864 + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); 1.5865 + } 1.5866 + } 1.5867 + else 1.5868 + { 1.5869 + BILINEAR_SKIP_ONE_PIXEL (); 1.5870 + } 1.5871 + 1.5872 + w--; 1.5873 + dst++; 1.5874 + } 1.5875 + 1.5876 + while (w >= 4) 1.5877 + { 1.5878 + __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; 1.5879 + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1.5880 + __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; 1.5881 + 1.5882 + m = *(uint32_t*)mask; 1.5883 + 1.5884 + if (m) 1.5885 + { 1.5886 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 1.5887 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); 1.5888 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); 1.5889 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); 1.5890 + 1.5891 + xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); 1.5892 + 1.5893 + if (m == 0xffffffff && is_opaque (xmm_src)) 1.5894 + { 1.5895 + save_128_aligned ((__m128i *)dst, xmm_src); 1.5896 + } 1.5897 + else 1.5898 + { 1.5899 + xmm_dst = load_128_aligned ((__m128i *)dst); 1.5900 + 1.5901 + xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); 1.5902 + 1.5903 + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 1.5904 + unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); 1.5905 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.5906 + 1.5907 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); 1.5908 + expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); 1.5909 + 1.5910 + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, 
1.5911 + &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); 1.5912 + 1.5913 + save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.5914 + } 1.5915 + } 1.5916 + else 1.5917 + { 1.5918 + BILINEAR_SKIP_ONE_PIXEL (); 1.5919 + BILINEAR_SKIP_ONE_PIXEL (); 1.5920 + BILINEAR_SKIP_ONE_PIXEL (); 1.5921 + BILINEAR_SKIP_ONE_PIXEL (); 1.5922 + } 1.5923 + 1.5924 + w -= 4; 1.5925 + dst += 4; 1.5926 + mask += 4; 1.5927 + } 1.5928 + 1.5929 + while (w) 1.5930 + { 1.5931 + uint32_t sa; 1.5932 + 1.5933 + m = (uint32_t) *mask++; 1.5934 + 1.5935 + if (m) 1.5936 + { 1.5937 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 1.5938 + sa = pix1 >> 24; 1.5939 + 1.5940 + if (sa == 0xff && m == 0xff) 1.5941 + { 1.5942 + *dst = pix1; 1.5943 + } 1.5944 + else 1.5945 + { 1.5946 + __m128i ms, md, ma, msa; 1.5947 + 1.5948 + pix2 = *dst; 1.5949 + ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); 1.5950 + ms = unpack_32_1x128 (pix1); 1.5951 + md = unpack_32_1x128 (pix2); 1.5952 + 1.5953 + msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); 1.5954 + 1.5955 + *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); 1.5956 + } 1.5957 + } 1.5958 + else 1.5959 + { 1.5960 + BILINEAR_SKIP_ONE_PIXEL (); 1.5961 + } 1.5962 + 1.5963 + w--; 1.5964 + dst++; 1.5965 + } 1.5966 +} 1.5967 + 1.5968 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER, 1.5969 + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, 1.5970 + uint32_t, uint8_t, uint32_t, 1.5971 + COVER, FLAG_HAVE_NON_SOLID_MASK) 1.5972 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER, 1.5973 + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, 1.5974 + uint32_t, uint8_t, uint32_t, 1.5975 + PAD, FLAG_HAVE_NON_SOLID_MASK) 1.5976 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER, 1.5977 + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, 1.5978 + uint32_t, uint8_t, uint32_t, 1.5979 + NONE, FLAG_HAVE_NON_SOLID_MASK) 1.5980 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER, 1.5981 + scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, 1.5982 + uint32_t, uint8_t, uint32_t, 1.5983 + NORMAL, FLAG_HAVE_NON_SOLID_MASK) 1.5984 + 1.5985 +static force_inline void 1.5986 +scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst, 1.5987 + const uint32_t * mask, 1.5988 + const uint32_t * src_top, 1.5989 + const uint32_t * src_bottom, 1.5990 + int32_t w, 1.5991 + int wt, 1.5992 + int wb, 1.5993 + pixman_fixed_t vx, 1.5994 + pixman_fixed_t unit_x, 1.5995 + pixman_fixed_t max_vx, 1.5996 + pixman_bool_t zero_src) 1.5997 +{ 1.5998 + BILINEAR_DECLARE_VARIABLES; 1.5999 + uint32_t pix1, pix2, pix3, pix4; 1.6000 + __m128i xmm_mask; 1.6001 + 1.6002 + if (zero_src || (*mask >> 24) == 0) 1.6003 + return; 1.6004 + 1.6005 + xmm_mask = create_mask_16_128 (*mask >> 24); 1.6006 + 1.6007 + while (w && ((uintptr_t)dst & 15)) 1.6008 + { 1.6009 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 1.6010 + if (pix1) 1.6011 + { 1.6012 + uint32_t d = *dst; 1.6013 + 1.6014 + __m128i ms = unpack_32_1x128 (pix1); 1.6015 + __m128i alpha = expand_alpha_1x128 (ms); 1.6016 + __m128i dest = xmm_mask; 1.6017 + __m128i alpha_dst = unpack_32_1x128 (d); 1.6018 + 1.6019 + *dst = pack_1x128_32 1.6020 + (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); 1.6021 + } 1.6022 + 1.6023 + dst++; 1.6024 + w--; 1.6025 + } 1.6026 + 1.6027 + while (w >= 4) 1.6028 + { 1.6029 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 1.6030 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); 1.6031 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); 1.6032 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); 1.6033 + 1.6034 + if (pix1 | pix2 
| pix3 | pix4) 1.6035 + { 1.6036 + __m128i xmm_src, xmm_src_lo, xmm_src_hi; 1.6037 + __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; 1.6038 + __m128i xmm_alpha_lo, xmm_alpha_hi; 1.6039 + 1.6040 + xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); 1.6041 + 1.6042 + xmm_dst = load_128_aligned ((__m128i*)dst); 1.6043 + 1.6044 + unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); 1.6045 + unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); 1.6046 + expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, 1.6047 + &xmm_alpha_lo, &xmm_alpha_hi); 1.6048 + 1.6049 + in_over_2x128 (&xmm_src_lo, &xmm_src_hi, 1.6050 + &xmm_alpha_lo, &xmm_alpha_hi, 1.6051 + &xmm_mask, &xmm_mask, 1.6052 + &xmm_dst_lo, &xmm_dst_hi); 1.6053 + 1.6054 + save_128_aligned 1.6055 + ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); 1.6056 + } 1.6057 + 1.6058 + dst += 4; 1.6059 + w -= 4; 1.6060 + } 1.6061 + 1.6062 + while (w) 1.6063 + { 1.6064 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 1.6065 + if (pix1) 1.6066 + { 1.6067 + uint32_t d = *dst; 1.6068 + 1.6069 + __m128i ms = unpack_32_1x128 (pix1); 1.6070 + __m128i alpha = expand_alpha_1x128 (ms); 1.6071 + __m128i dest = xmm_mask; 1.6072 + __m128i alpha_dst = unpack_32_1x128 (d); 1.6073 + 1.6074 + *dst = pack_1x128_32 1.6075 + (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); 1.6076 + } 1.6077 + 1.6078 + dst++; 1.6079 + w--; 1.6080 + } 1.6081 +} 1.6082 + 1.6083 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, 1.6084 + scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL, 1.6085 + uint32_t, uint32_t, uint32_t, 1.6086 + COVER, FLAG_HAVE_SOLID_MASK) 1.6087 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, 1.6088 + scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL, 1.6089 + uint32_t, uint32_t, uint32_t, 1.6090 + PAD, FLAG_HAVE_SOLID_MASK) 1.6091 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, 1.6092 + scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL, 1.6093 + uint32_t, uint32_t, uint32_t, 1.6094 + NONE, FLAG_HAVE_SOLID_MASK) 1.6095 +FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, 1.6096 + scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL, 1.6097 + uint32_t, uint32_t, uint32_t, 1.6098 + NORMAL, FLAG_HAVE_SOLID_MASK) 1.6099 + 1.6100 +static const pixman_fast_path_t sse2_fast_paths[] = 1.6101 +{ 1.6102 + /* PIXMAN_OP_OVER */ 1.6103 + PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565), 1.6104 + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565), 1.6105 + PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888), 1.6106 + PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888), 1.6107 + PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565), 1.6108 + PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565), 1.6109 + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888), 1.6110 + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888), 1.6111 + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888), 1.6112 + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888), 1.6113 + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565), 1.6114 + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565), 1.6115 + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888), 1.6116 + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, 
sse2_composite_over_n_8_8888), 1.6117 + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888), 1.6118 + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888), 1.6119 + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888), 1.6120 + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888), 1.6121 + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888), 1.6122 + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888), 1.6123 + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888), 1.6124 + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888), 1.6125 + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888), 1.6126 + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888), 1.6127 + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888), 1.6128 + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888), 1.6129 + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888), 1.6130 + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888), 1.6131 + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888), 1.6132 + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888), 1.6133 + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888), 1.6134 + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888), 1.6135 + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888), 1.6136 + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca), 1.6137 + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca), 1.6138 + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca), 1.6139 + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca), 1.6140 + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca), 1.6141 + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca), 1.6142 + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888), 1.6143 + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888), 1.6144 + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888), 1.6145 + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888), 1.6146 + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565), 1.6147 + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565), 1.6148 + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), 1.6149 + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), 1.6150 + 1.6151 + /* PIXMAN_OP_OVER_REVERSE */ 1.6152 + PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888), 1.6153 + PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888), 1.6154 + 1.6155 + /* PIXMAN_OP_ADD */ 1.6156 + 
PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca), 1.6157 + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8), 1.6158 + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888), 1.6159 + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888), 1.6160 + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8), 1.6161 + PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8), 1.6162 + PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888), 1.6163 + PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888), 1.6164 + PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888), 1.6165 + PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888), 1.6166 + PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888), 1.6167 + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888), 1.6168 + PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888), 1.6169 + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888), 1.6170 + 1.6171 + /* PIXMAN_OP_SRC */ 1.6172 + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888), 1.6173 + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888), 1.6174 + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888), 1.6175 + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888), 1.6176 + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565), 1.6177 + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565), 1.6178 + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565), 1.6179 + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565), 1.6180 + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888), 1.6181 + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888), 1.6182 + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area), 1.6183 + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area), 1.6184 + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), 1.6185 + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), 1.6186 + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), 1.6187 + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), 1.6188 + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area), 1.6189 + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area), 1.6190 + 1.6191 + /* PIXMAN_OP_IN */ 1.6192 + PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8), 1.6193 + PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8), 1.6194 + PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8), 1.6195 + 1.6196 + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), 1.6197 + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), 1.6198 + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), 1.6199 + SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), 1.6200 + SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), 1.6201 + 
1.6201 +    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
1.6202 +    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
1.6203 +    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
1.6204 +    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
1.6205 +    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
1.6206 +    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
1.6207 +    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
1.6208 +    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
1.6209 +    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
1.6210 +    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
1.6211 +    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
1.6212 +
1.6213 +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
1.6214 +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
1.6215 +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
1.6216 +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
1.6217 +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
1.6218 +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
1.6219 +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
1.6220 +    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
1.6221 +
1.6222 +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
1.6223 +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
1.6224 +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
1.6225 +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
1.6226 +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
1.6227 +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
1.6228 +
1.6229 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
1.6230 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
1.6231 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
1.6232 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
1.6233 +
1.6234 +    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
1.6235 +    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
1.6236 +    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
1.6237 +    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
1.6238 +
1.6239 +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
1.6240 +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
1.6241 +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
1.6242 +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
1.6243 +
1.6244 +    /* and here the needed entries are added to the fast path table */
1.6245 +
1.6246 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, sse2_8888_0565),
1.6247 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, b5g6r5, sse2_8888_0565),
1.6248 +
1.6249 +    { PIXMAN_OP_NONE },
1.6250 +};
1.6251 +
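Editor's note on the table above: each PIXMAN_STD_FAST_PATH / SIMPLE_*_FAST_PATH entry keys a specialized compositing routine by operator plus source, mask and destination formats, and pixman's core scans the table and falls back to a slower general path when nothing matches. The sketch below only illustrates that dispatch idea; the real pixman_fast_path_t entries produced by these macros also carry required image-flag bits, so the struct layout and lookup_fast_path here are simplified, hypothetical stand-ins.

/* Simplified illustration only -- not pixman's actual pixman_fast_path_t. */
#include <stddef.h>

typedef struct
{
    int op;                 /* e.g. PIXMAN_OP_OVER              */
    int src_format;         /* e.g. PIXMAN_a8r8g8b8             */
    int mask_format;        /* e.g. PIXMAN_a8, or 0 for "null"  */
    int dest_format;        /* e.g. PIXMAN_x8r8g8b8             */
    void (*func) (void);    /* specialized compositing routine  */
} fast_path_sketch_t;

static const fast_path_sketch_t *
lookup_fast_path (const fast_path_sketch_t *table,
                  int op, int src, int mask, int dest)
{
    const fast_path_sketch_t *p;

    /* The first matching entry wins, so more specific paths are listed
     * earlier; a sentinel entry with a NULL function ends the scan. */
    for (p = table; p->func != NULL; p++)
    {
        if (p->op == op &&
            p->src_format == src &&
            p->mask_format == mask &&
            p->dest_format == dest)
            return p;
    }

    return NULL; /* caller falls back to a general path */
}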
1.6252 +static uint32_t *
1.6253 +sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
1.6254 +{
1.6255 +    int w = iter->width;
1.6256 +    __m128i ff000000 = mask_ff000000;
1.6257 +    uint32_t *dst = iter->buffer;
1.6258 +    uint32_t *src = (uint32_t *)iter->bits;
1.6259 +
1.6260 +    iter->bits += iter->stride;
1.6261 +
1.6262 +    while (w && ((uintptr_t)dst) & 0x0f)
1.6263 +    {
1.6264 +        *dst++ = (*src++) | 0xff000000;
1.6265 +        w--;
1.6266 +    }
1.6267 +
1.6268 +    while (w >= 4)
1.6269 +    {
1.6270 +        save_128_aligned (
1.6271 +            (__m128i *)dst, _mm_or_si128 (
1.6272 +                load_128_unaligned ((__m128i *)src), ff000000));
1.6273 +
1.6274 +        dst += 4;
1.6275 +        src += 4;
1.6276 +        w -= 4;
1.6277 +    }
1.6278 +
1.6279 +    while (w)
1.6280 +    {
1.6281 +        *dst++ = (*src++) | 0xff000000;
1.6282 +        w--;
1.6283 +    }
1.6284 +
1.6285 +    return iter->buffer;
1.6286 +}
1.6287 +
1.6288 +static uint32_t *
1.6289 +sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
1.6290 +{
1.6291 +    int w = iter->width;
1.6292 +    uint32_t *dst = iter->buffer;
1.6293 +    uint16_t *src = (uint16_t *)iter->bits;
1.6294 +    __m128i ff000000 = mask_ff000000;
1.6295 +
1.6296 +    iter->bits += iter->stride;
1.6297 +
1.6298 +    while (w && ((uintptr_t)dst) & 0x0f)
1.6299 +    {
1.6300 +        uint16_t s = *src++;
1.6301 +
1.6302 +        *dst++ = convert_0565_to_8888 (s);
1.6303 +        w--;
1.6304 +    }
1.6305 +
1.6306 +    while (w >= 8)
1.6307 +    {
1.6308 +        __m128i lo, hi, s;
1.6309 +
1.6310 +        s = _mm_loadu_si128 ((__m128i *)src);
1.6311 +
1.6312 +        lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
1.6313 +        hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
1.6314 +
1.6315 +        save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
1.6316 +        save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
1.6317 +
1.6318 +        dst += 8;
1.6319 +        src += 8;
1.6320 +        w -= 8;
1.6321 +    }
1.6322 +
1.6323 +    while (w)
1.6324 +    {
1.6325 +        uint16_t s = *src++;
1.6326 +
1.6327 +        *dst++ = convert_0565_to_8888 (s);
1.6328 +        w--;
1.6329 +    }
1.6330 +
1.6331 +    return iter->buffer;
1.6332 +}
1.6333 +
1.6334 +static uint32_t *
1.6335 +sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
1.6336 +{
1.6337 +    int w = iter->width;
1.6338 +    uint32_t *dst = iter->buffer;
1.6339 +    uint8_t *src = iter->bits;
1.6340 +    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
1.6341 +
1.6342 +    iter->bits += iter->stride;
1.6343 +
1.6344 +    while (w && (((uintptr_t)dst) & 15))
1.6345 +    {
1.6346 +        *dst++ = *(src++) << 24;
1.6347 +        w--;
1.6348 +    }
1.6349 +
1.6350 +    while (w >= 16)
1.6351 +    {
1.6352 +        xmm0 = _mm_loadu_si128 ((__m128i *)src);
1.6353 +
1.6354 +        xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128 (), xmm0);
1.6355 +        xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128 (), xmm0);
1.6356 +        xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128 (), xmm1);
1.6357 +        xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128 (), xmm1);
1.6358 +        xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128 (), xmm2);
1.6359 +        xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128 (), xmm2);
1.6360 +
1.6361 +        _mm_store_si128 (((__m128i *)(dst +  0)), xmm3);
1.6362 +        _mm_store_si128 (((__m128i *)(dst +  4)), xmm4);
1.6363 +        _mm_store_si128 (((__m128i *)(dst +  8)), xmm5);
1.6364 +        _mm_store_si128 (((__m128i *)(dst + 12)), xmm6);
1.6365 +
1.6366 +        dst += 16;
1.6367 +        src += 16;
1.6368 +        w -= 16;
1.6369 +    }
1.6370 +
1.6371 +    while (w)
1.6372 +    {
1.6373 +        *dst++ = *(src++) << 24;
1.6374 +        w--;
1.6375 +    }
1.6376 +
1.6377 +    return iter->buffer;
1.6378 +}
1.6379 +
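Editor's note: the three fetchers above (sse2_fetch_x8r8g8b8, sse2_fetch_r5g6b5, sse2_fetch_a8) each widen one scanline of the source image into the iterator's a8r8g8b8 buffer; the SSE2 inner loops are vectorized versions of the scalar head/tail loops. A minimal scalar reference for the per-pixel conversions follows; the fetch_*_pixel helpers are illustrative stand-ins rather than pixman API, and the 565 case shows the usual replicate-high-bits widening that convert_0565_to_8888 performs.

/* Scalar reference for the per-pixel conversions done by the fetchers. */
#include <stdint.h>

static uint32_t
fetch_x8r8g8b8_pixel (uint32_t s)
{
    return s | 0xff000000;              /* force alpha to 0xff */
}

static uint32_t
fetch_r5g6b5_pixel (uint16_t s)
{
    uint32_t r = (s >> 11) & 0x1f;
    uint32_t g = (s >> 5)  & 0x3f;
    uint32_t b = s         & 0x1f;

    /* Expand 5/6-bit channels to 8 bits by replicating the high bits
     * into the low bits, then force alpha to 0xff. */
    r = (r << 3) | (r >> 2);
    g = (g << 2) | (g >> 4);
    b = (b << 3) | (b >> 2);

    return 0xff000000 | (r << 16) | (g << 8) | b;
}

static uint32_t
fetch_a8_pixel (uint8_t s)
{
    return (uint32_t)s << 24;           /* alpha only; RGB stay zero */
}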
1.6380 +typedef struct
1.6381 +{
1.6382 +    pixman_format_code_t format;
1.6383 +    pixman_iter_get_scanline_t get_scanline;
1.6384 +} fetcher_info_t;
1.6385 +
1.6386 +static const fetcher_info_t fetchers[] =
1.6387 +{
1.6388 +    { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
1.6389 +    { PIXMAN_r5g6b5, sse2_fetch_r5g6b5 },
1.6390 +    { PIXMAN_a8, sse2_fetch_a8 },
1.6391 +    { PIXMAN_null }
1.6392 +};
1.6393 +
1.6394 +static pixman_bool_t
1.6395 +sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
1.6396 +{
1.6397 +    pixman_image_t *image = iter->image;
1.6398 +
1.6399 +#define FLAGS \
1.6400 +    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
1.6401 +     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
1.6402 +
1.6403 +    if ((iter->iter_flags & ITER_NARROW) &&
1.6404 +        (iter->image_flags & FLAGS) == FLAGS)
1.6405 +    {
1.6406 +        const fetcher_info_t *f;
1.6407 +
1.6408 +        for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
1.6409 +        {
1.6410 +            if (image->common.extended_format_code == f->format)
1.6411 +            {
1.6412 +                uint8_t *b = (uint8_t *)image->bits.bits;
1.6413 +                int s = image->bits.rowstride * 4;
1.6414 +
1.6415 +                iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
1.6416 +                iter->stride = s;
1.6417 +
1.6418 +                iter->get_scanline = f->get_scanline;
1.6419 +                return TRUE;
1.6420 +            }
1.6421 +        }
1.6422 +    }
1.6423 +
1.6424 +    return FALSE;
1.6425 +}
1.6426 +
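Editor's note: sse2_src_iter_init only claims the iterator when a narrow (8-bit-per-channel) iterator is requested, the image is an untransformed bits image whose samples cover the clip, and its format is one of the three fetchers above; it then points iter->bits at pixel (iter->x, iter->y). Since image->bits.rowstride is counted in uint32_t words, the code multiplies by 4 to obtain a byte stride. A small worked example of that arithmetic, using hypothetical image parameters, follows.

/* Worked example of the offset computation in sse2_src_iter_init, for a
 * hypothetical 640x480 r5g6b5 image whose rowstride is 320 uint32_t words
 * (640 pixels * 2 bytes = 1280 bytes = 320 words). */
#include <stdint.h>
#include <stdio.h>

int
main (void)
{
    int rowstride_words = 320;  /* as stored in image->bits.rowstride      */
    int bpp = 16;               /* PIXMAN_FORMAT_BPP for r5g6b5            */
    int x = 10, y = 3;          /* iter->x, iter->y                        */

    int  stride_bytes = rowstride_words * 4;
    long offset       = (long)stride_bytes * y + x * bpp / 8;

    /* iter->bits would be (uint8_t *)image->bits.bits + offset, and
     * advancing by stride_bytes per scanline walks down the image. */
    printf ("byte offset of pixel (%d, %d): %ld\n", x, y, offset); /* 3860 */
    return 0;
}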
1.6427 +#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
1.6428 +__attribute__((__force_align_arg_pointer__))
1.6429 +#endif
1.6430 +pixman_implementation_t *
1.6431 +_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
1.6432 +{
1.6433 +    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
1.6434 +
1.6435 +    /* SSE2 constants */
1.6436 +    mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
1.6437 +    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
1.6438 +    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
1.6439 +    mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
1.6440 +    mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
1.6441 +    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
1.6442 +    mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
1.6443 +    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
1.6444 +    mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
1.6445 +    mask_0080 = create_mask_16_128 (0x0080);
1.6446 +    mask_00ff = create_mask_16_128 (0x00ff);
1.6447 +    mask_0101 = create_mask_16_128 (0x0101);
1.6448 +    mask_ffff = create_mask_16_128 (0xffff);
1.6449 +    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
1.6450 +    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
1.6451 +    mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
1.6452 +    mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
1.6453 +
1.6454 +    /* Set up function pointers */
1.6455 +    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
1.6456 +    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
1.6457 +    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
1.6458 +    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
1.6459 +    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
1.6460 +    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
1.6461 +    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
1.6462 +    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
1.6463 +    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
1.6464 +    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
1.6465 +
1.6466 +    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
1.6467 +
1.6468 +    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
1.6469 +    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
1.6470 +    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
1.6471 +    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
1.6472 +    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
1.6473 +    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
1.6474 +    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
1.6475 +    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
1.6476 +    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
1.6477 +    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
1.6478 +    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
1.6479 +
1.6480 +    imp->blt = sse2_blt;
1.6481 +    imp->fill = sse2_fill;
1.6482 +
1.6483 +    imp->src_iter_init = sse2_src_iter_init;
1.6484 +
1.6485 +    return imp;
1.6486 +}
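Editor's note: _pixman_implementation_create_sse2 layers the SSE2 code on top of the fallback implementation it is given; the fast-path table, combiners, blt/fill and the source-iterator hook installed above are tried first, and anything they do not handle is delegated to that fallback. The sketch below shows the general layering pattern only, assuming the declarations from pixman-private.h; have_sse2() and create_baseline_implementation() are hypothetical placeholders, and pixman's actual runtime CPU detection and selection code lives elsewhere.

/* Hedged sketch of layering an SSE2 implementation over a fallback. */
static pixman_implementation_t *
choose_implementation_sketch (void)
{
    /* create_baseline_implementation() stands in for whatever builds the
     * portable C implementation; it is not a real pixman function. */
    pixman_implementation_t *imp = create_baseline_implementation ();

    /* Each accelerated implementation wraps the previous one; operations
     * it does not handle are delegated to the fallback it was given. */
    if (have_sse2 ())
        imp = _pixman_implementation_create_sse2 (imp);

    return imp;
}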