--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gfx/cairo/libpixman/src/pixman-mmx.c	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,4084 @@
+/*
+ * Copyright © 2004, 2005 Red Hat, Inc.
+ * Copyright © 2004 Nicholas Miell
+ * Copyright © 2005 Trolltech AS
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Red Hat not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  Red Hat makes no representations about the
+ * suitability of this software for any purpose.  It is provided "as is"
+ * without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
+ * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
+ * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
+ * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
+ * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
+ * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
+ * SOFTWARE.
+ *
+ * Author:  Søren Sandmann (sandmann@redhat.com)
+ * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
+ * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
+ *
+ * Based on work by Owen Taylor
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
+
+#ifdef USE_LOONGSON_MMI
+#include <loongson-mmintrin.h>
+#else
+#include <mmintrin.h>
+#endif
+#include "pixman-private.h"
+#include "pixman-combine32.h"
+#include "pixman-inlines.h"
+
+#define no_vERBOSE
+
+#ifdef VERBOSE
+#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
+#else
+#define CHECKPOINT()
+#endif
+
+#if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
+/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */
+extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_empty (void)
+{
+
+}
+#endif
+
+#ifdef USE_X86_MMX
+# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
+#  include <xmmintrin.h>
+# else
+/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
+ * instructions to be generated that we don't want. Just duplicate the
+ * functions we want to use. */
+extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_movemask_pi8 (__m64 __A)
+{
+    int ret;
+
+    asm ("pmovmskb %1, %0\n\t"
+         : "=r" (ret)
+         : "y" (__A)
+    );
+
+    return ret;
+}
+
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_mulhi_pu16 (__m64 __A, __m64 __B)
+{
+    asm ("pmulhuw %1, %0\n\t"
+         : "+y" (__A)
+         : "y" (__B)
+    );
+    return __A;
+}
+
+# ifdef __OPTIMIZE__
+extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mm_shuffle_pi16 (__m64 __A, int8_t const __N)
+{
+    __m64 ret;
+
+    asm ("pshufw %2, %1, %0\n\t"
+         : "=y" (ret)
+         : "y" (__A), "K" (__N)
+    );
+
+    return ret;
+}
+# else
+# define _mm_shuffle_pi16(A, N)					\
+    ({								\
+	__m64 ret;						\
+								\
+	asm ("pshufw %2, %1, %0\n\t"				\
+	     : "=y" (ret)					\
+	     : "y" (A), "K" ((const int8_t)N)			\
+	);							\
+								\
+	ret;							\
+    })
+# endif
+# endif
+#endif
+
+#ifndef _MSC_VER
+#define _MM_SHUFFLE(fp3,fp2,fp1,fp0)			\
+    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
+#endif
+
+/* Notes about writing mmx code
+ *
+ * give memory operands as the second operand. If you give it as the
+ * first, gcc will first load it into a register, then use that
+ * register
+ *
+ *   ie. use
+ *
+ *         _mm_mullo_pi16 (x, mmx_constant);
+ *
+ *   not
+ *
+ *         _mm_mullo_pi16 (mmx_constant, x);
+ *
+ * Also try to minimize dependencies. i.e. when you need a value, try
+ * to calculate it from a value that was calculated as early as
+ * possible.
+ */
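+
+/* A short illustration of the operand-order note above (a hypothetical
+ * snippet, not one of this file's call sites): in
+ *
+ *         __m64 t = _mm_adds_pu16 (x, MC (4x0080));
+ *
+ * the constant can be used directly as a memory operand of paddusw,
+ * whereas passing it as the first argument makes gcc load it into a
+ * scratch register before the addition.
+ */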
+
+/* --------------- MMX primitives ------------------------------------- */
+
+/* If __m64 is defined as a struct or union, then define M64_MEMBER to be
+ * the name of the member used to access the data.
+ * If __m64 requires using mm_cvt* intrinsics functions to convert between
+ * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
+ * If __m64 and uint64_t values can just be cast to each other directly,
+ * then define USE_M64_CASTS.
+ * If __m64 is a double datatype, then define USE_M64_DOUBLE.
+ */
+#ifdef _MSC_VER
+# define M64_MEMBER m64_u64
+#elif defined(__ICC)
+# define USE_CVT_INTRINSICS
+#elif defined(USE_LOONGSON_MMI)
+# define USE_M64_DOUBLE
+#elif defined(__GNUC__)
+# define USE_M64_CASTS
+#elif defined(__SUNPRO_C)
+# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
+/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
+ * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
+ * is defined.  If it is used, then the mm_cvt* intrinsics must be used.
+ */
+#  define USE_CVT_INTRINSICS
+# else
+/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
+ * disabled, __m64 is defined as a struct containing "unsigned long long l_".
+ */
+#  define M64_MEMBER l_
+# endif
+#endif
+
+#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
+typedef uint64_t mmxdatafield;
+#else
+typedef __m64 mmxdatafield;
+#endif
+
+typedef struct
+{
+    mmxdatafield mmx_4x00ff;
+    mmxdatafield mmx_4x0080;
+    mmxdatafield mmx_565_rgb;
+    mmxdatafield mmx_565_unpack_multiplier;
+    mmxdatafield mmx_565_pack_multiplier;
+    mmxdatafield mmx_565_r;
+    mmxdatafield mmx_565_g;
+    mmxdatafield mmx_565_b;
+    mmxdatafield mmx_packed_565_rb;
+    mmxdatafield mmx_packed_565_g;
+    mmxdatafield mmx_expand_565_g;
+    mmxdatafield mmx_expand_565_b;
+    mmxdatafield mmx_expand_565_r;
+#ifndef USE_LOONGSON_MMI
+    mmxdatafield mmx_mask_0;
+    mmxdatafield mmx_mask_1;
+    mmxdatafield mmx_mask_2;
+    mmxdatafield mmx_mask_3;
+#endif
+    mmxdatafield mmx_full_alpha;
+    mmxdatafield mmx_4x0101;
+    mmxdatafield mmx_ff000000;
+} mmx_data_t;
+
+#if defined(_MSC_VER)
+# define MMXDATA_INIT(field, val) { val ## UI64 }
+#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
+# define MMXDATA_INIT(field, val) field = { val ## ULL }
+#else                           /* mmxdatafield is an integral type */
+# define MMXDATA_INIT(field, val) field = val ## ULL
+#endif
+
+static const mmx_data_t c =
+{
+    MMXDATA_INIT (.mmx_4x00ff,                0x00ff00ff00ff00ff),
+    MMXDATA_INIT (.mmx_4x0080,                0x0080008000800080),
+    MMXDATA_INIT (.mmx_565_rgb,               0x000001f0003f001f),
+    MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840),
+    MMXDATA_INIT (.mmx_565_pack_multiplier,   0x2000000420000004),
+    MMXDATA_INIT (.mmx_565_r,                 0x000000f800000000),
+    MMXDATA_INIT (.mmx_565_g,                 0x0000000000fc0000),
+    MMXDATA_INIT (.mmx_565_b,                 0x00000000000000f8),
+    MMXDATA_INIT (.mmx_packed_565_rb,         0x00f800f800f800f8),
+    MMXDATA_INIT (.mmx_packed_565_g,          0x0000fc000000fc00),
+    MMXDATA_INIT (.mmx_expand_565_g,          0x07e007e007e007e0),
+    MMXDATA_INIT (.mmx_expand_565_b,          0x001f001f001f001f),
+    MMXDATA_INIT (.mmx_expand_565_r,          0xf800f800f800f800),
+#ifndef USE_LOONGSON_MMI
+    MMXDATA_INIT (.mmx_mask_0,                0xffffffffffff0000),
+    MMXDATA_INIT (.mmx_mask_1,                0xffffffff0000ffff),
+    MMXDATA_INIT (.mmx_mask_2,                0xffff0000ffffffff),
+    MMXDATA_INIT (.mmx_mask_3,                0x0000ffffffffffff),
+#endif
+    MMXDATA_INIT (.mmx_full_alpha,            0x00ff000000000000),
+    MMXDATA_INIT (.mmx_4x0101,                0x0101010101010101),
+    MMXDATA_INIT (.mmx_ff000000,              0xff000000ff000000),
+};
+
+#ifdef USE_CVT_INTRINSICS
+#    define MC(x) to_m64 (c.mmx_ ## x)
+#elif defined(USE_M64_CASTS)
+#    define MC(x) ((__m64)c.mmx_ ## x)
+#elif defined(USE_M64_DOUBLE)
+#    define MC(x) (*(__m64 *)&c.mmx_ ## x)
+#else
+#    define MC(x) c.mmx_ ## x
+#endif
+
+static force_inline __m64
+to_m64 (uint64_t x)
+{
+#ifdef USE_CVT_INTRINSICS
+    return _mm_cvtsi64_m64 (x);
+#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
+    __m64 res;
+
+    res.M64_MEMBER = x;
+    return res;
+#elif defined USE_M64_DOUBLE
+    return *(__m64 *)&x;
+#else /* USE_M64_CASTS */
+    return (__m64)x;
+#endif
+}
+
+static force_inline uint64_t
+to_uint64 (__m64 x)
+{
+#ifdef USE_CVT_INTRINSICS
+    return _mm_cvtm64_si64 (x);
+#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
+    uint64_t res = x.M64_MEMBER;
+    return res;
+#elif defined USE_M64_DOUBLE
+    return *(uint64_t *)&x;
+#else /* USE_M64_CASTS */
+    return (uint64_t)x;
+#endif
+}
+
+static force_inline __m64
+shift (__m64 v,
+       int   s)
+{
+    if (s > 0)
+	return _mm_slli_si64 (v, s);
+    else if (s < 0)
+	return _mm_srli_si64 (v, -s);
+    else
+	return v;
+}
+
+static force_inline __m64
+negate (__m64 mask)
+{
+    return _mm_xor_si64 (mask, MC (4x00ff));
+}
+
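+/* pix_multiply below computes a rounded per-channel (a * b) / 255 with
+ * the usual exact trick: with t = a * b + 0x80, the value
+ * (t + (t >> 8)) >> 8 equals the correctly rounded quotient, and the
+ * final pmulhuw by 0x0101 computes exactly that, since
+ * (t * 0x0101) >> 16 == (t + (t >> 8)) >> 8 for any 16-bit t.
+ * Worked example, a = 0xff, b = 0x7f:
+ *   t = 0xff * 0x7f + 0x80 = 0x7f01;  (0x7f01 * 0x0101) >> 16 = 0x7f.
+ */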
+static force_inline __m64
+pix_multiply (__m64 a, __m64 b)
+{
+    __m64 res;
+
+    res = _mm_mullo_pi16 (a, b);
+    res = _mm_adds_pu16 (res, MC (4x0080));
+    res = _mm_mulhi_pu16 (res, MC (4x0101));
+
+    return res;
+}
+
+static force_inline __m64
+pix_add (__m64 a, __m64 b)
+{
+    return _mm_adds_pu8 (a, b);
+}
+
+static force_inline __m64
+expand_alpha (__m64 pixel)
+{
+    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
+}
+
+static force_inline __m64
+expand_alpha_rev (__m64 pixel)
+{
+    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
+}
+
+static force_inline __m64
+invert_colors (__m64 pixel)
+{
+    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
+}
+
+static force_inline __m64
+over (__m64 src,
+      __m64 srca,
+      __m64 dest)
+{
+    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
+}
+
+static force_inline __m64
+over_rev_non_pre (__m64 src, __m64 dest)
+{
+    __m64 srca = expand_alpha (src);
+    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
+
+    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
+}
+
+static force_inline __m64
+in (__m64 src, __m64 mask)
+{
+    return pix_multiply (src, mask);
+}
+
+#ifndef _MSC_VER
+static force_inline __m64
+in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
+{
+    return over (in (src, mask), pix_multiply (srca, mask), dest);
+}
+
+#else
+
+#define in_over(src, srca, mask, dest)					\
+    over (in (src, mask), pix_multiply (srca, mask), dest)
+
+#endif
+
+/* Elemental unaligned loads */
+
+static force_inline __m64 ldq_u(__m64 *p)
+{
+#ifdef USE_X86_MMX
+    /* x86's alignment restrictions are very relaxed. */
+    return *(__m64 *)p;
+#elif defined USE_ARM_IWMMXT
+    int align = (uintptr_t)p & 7;
+    __m64 *aligned_p;
+    if (align == 0)
+	return *p;
+    aligned_p = (__m64 *)((uintptr_t)p & ~7);
+    return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
+#else
+    struct __una_u64 { __m64 x __attribute__((packed)); };
+    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
+    return (__m64) ptr->x;
+#endif
+}
+
+static force_inline uint32_t ldl_u(const uint32_t *p)
+{
+#ifdef USE_X86_MMX
+    /* x86's alignment restrictions are very relaxed. */
+    return *p;
+#else
+    struct __una_u32 { uint32_t x __attribute__((packed)); };
+    const struct __una_u32 *ptr = (const struct __una_u32 *) p;
+    return ptr->x;
+#endif
+}
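+
+/* Both loaders above use the same idiom on strict-alignment targets:
+ * wrapping the value in a struct declared __attribute__((packed)) drops
+ * its alignment requirement to 1, so the compiler emits whatever
+ * unaligned-safe load sequence the target needs; x86 just dereferences
+ * directly, and iwMMXt stitches two aligned loads together with
+ * _mm_align_si64. */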
+
+static force_inline __m64
+load (const uint32_t *v)
+{
+#ifdef USE_LOONGSON_MMI
+    __m64 ret;
+    asm ("lwc1 %0, %1\n\t"
+         : "=f" (ret)
+         : "m" (*v)
+    );
+    return ret;
+#else
+    return _mm_cvtsi32_si64 (*v);
+#endif
+}
+
+static force_inline __m64
+load8888 (const uint32_t *v)
+{
+#ifdef USE_LOONGSON_MMI
+    return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
+#else
+    return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
+#endif
+}
+
+static force_inline __m64
+load8888u (const uint32_t *v)
+{
+    uint32_t l = ldl_u (v);
+    return load8888 (&l);
+}
+
+static force_inline __m64
+pack8888 (__m64 lo, __m64 hi)
+{
+    return _mm_packs_pu16 (lo, hi);
+}
+
+static force_inline void
+store (uint32_t *dest, __m64 v)
+{
+#ifdef USE_LOONGSON_MMI
+    asm ("swc1 %1, %0\n\t"
+         : "=m" (*dest)
+         : "f" (v)
+         : "memory"
+    );
+#else
+    *dest = _mm_cvtsi64_si32 (v);
+#endif
+}
+
+static force_inline void
+store8888 (uint32_t *dest, __m64 v)
+{
+    v = pack8888 (v, _mm_setzero_si64 ());
+    store (dest, v);
+}
+
+static force_inline pixman_bool_t
+is_equal (__m64 a, __m64 b)
+{
+#ifdef USE_LOONGSON_MMI
+    /* __m64 is double, we can compare directly. */
+    return a == b;
+#else
+    return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
+#endif
+}
+
+static force_inline pixman_bool_t
+is_opaque (__m64 v)
+{
+#ifdef USE_LOONGSON_MMI
+    return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
+#else
+    __m64 ffs = _mm_cmpeq_pi8 (v, v);
+    return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
+#endif
+}
+
+static force_inline pixman_bool_t
+is_zero (__m64 v)
+{
+    return is_equal (v, _mm_setzero_si64 ());
+}
+
+/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
+ *
+ *    00RR00GG00BB
+ *
+ * --- Expanding 565 in the low word ---
+ *
+ * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
+ * m = m & (01f0003f001f);
+ * m = m * (008404100840);
+ * m = m >> 8;
+ *
+ * Note the trick here - the top word is shifted by another nibble to
+ * avoid it bumping into the middle word
+ */
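+/* Worked example of the expansion above, for the red channel of a
+ * pure-red 565 pixel (r5 = 0x1f, pixel 0xf800): after the shifts and
+ * the mask, r5 sits in the red word as r5 << 4, and
+ * ((r5 << 4) * 0x0084) >> 8 == (r5 * 0x840) >> 8, which reproduces
+ * (r5 << 3) | (r5 >> 2) -- i.e. 0x1f widens to 0xff, the usual
+ * replicate-high-bits expansion. */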
+static force_inline __m64
+expand565 (__m64 pixel, int pos)
+{
+    __m64 p = pixel;
+    __m64 t1, t2;
+
+    /* move pixel to low 16 bit and zero the rest */
+#ifdef USE_LOONGSON_MMI
+    p = loongson_extract_pi16 (p, pos);
+#else
+    p = shift (shift (p, (3 - pos) * 16), -48);
+#endif
+
+    t1 = shift (p, 36 - 11);
+    t2 = shift (p, 16 - 5);
+
+    p = _mm_or_si64 (t1, p);
+    p = _mm_or_si64 (t2, p);
+    p = _mm_and_si64 (p, MC (565_rgb));
+
+    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
+    return _mm_srli_pi16 (pixel, 8);
+}
+
+/* Expand 4 16 bit pixels in an mmx register into two mmx registers of
+ *
+ *    AARRGGBBRRGGBB
+ */
+static force_inline void
+expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
+{
+    __m64 t0, t1, alpha = _mm_setzero_si64 ();
+    __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
+    __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
+    __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
+    if (full_alpha)
+	alpha = _mm_cmpeq_pi32 (alpha, alpha);
+
+    /* Replicate high bits into empty low bits. */
+    r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
+    g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
+    b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
+
+    r = _mm_packs_pu16 (r, _mm_setzero_si64 ());	/* 00 00 00 00 R3 R2 R1 R0 */
+    g = _mm_packs_pu16 (g, _mm_setzero_si64 ());	/* 00 00 00 00 G3 G2 G1 G0 */
+    b = _mm_packs_pu16 (b, _mm_setzero_si64 ());	/* 00 00 00 00 B3 B2 B1 B0 */
+
+    t1 = _mm_unpacklo_pi8 (r, alpha);			/* A3 R3 A2 R2 A1 R1 A0 R0 */
+    t0 = _mm_unpacklo_pi8 (b, g);			/* G3 B3 G2 B2 G1 B1 G0 B0 */
+
+    *vout0 = _mm_unpacklo_pi16 (t0, t1);		/* A1 R1 G1 B1 A0 R0 G0 B0 */
+    *vout1 = _mm_unpackhi_pi16 (t0, t1);		/* A3 R3 G3 B3 A2 R2 G2 B2 */
+}
+
+static force_inline __m64
+expand8888 (__m64 in, int pos)
+{
+    if (pos == 0)
+	return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
+    else
+	return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
+}
+
+static force_inline __m64
+expandx888 (__m64 in, int pos)
+{
+    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
+}
+
+static force_inline void
+expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
+{
+    __m64 v0, v1;
+    expand_4xpacked565 (vin, &v0, &v1, full_alpha);
+    *vout0 = expand8888 (v0, 0);
+    *vout1 = expand8888 (v0, 1);
+    *vout2 = expand8888 (v1, 0);
+    *vout3 = expand8888 (v1, 1);
+}
+
+static force_inline __m64
+pack_565 (__m64 pixel, __m64 target, int pos)
+{
+    __m64 p = pixel;
+    __m64 t = target;
+    __m64 r, g, b;
+
+    r = _mm_and_si64 (p, MC (565_r));
+    g = _mm_and_si64 (p, MC (565_g));
+    b = _mm_and_si64 (p, MC (565_b));
+
+#ifdef USE_LOONGSON_MMI
+    r = shift (r, -(32 - 8));
+    g = shift (g, -(16 - 3));
+    b = shift (b, -(0 + 3));
+
+    p = _mm_or_si64 (r, g);
+    p = _mm_or_si64 (p, b);
+    return loongson_insert_pi16 (t, p, pos);
+#else
+    r = shift (r, -(32 - 8) + pos * 16);
+    g = shift (g, -(16 - 3) + pos * 16);
+    b = shift (b, -(0 + 3) + pos * 16);
+
+    if (pos == 0)
+	t = _mm_and_si64 (t, MC (mask_0));
+    else if (pos == 1)
+	t = _mm_and_si64 (t, MC (mask_1));
+    else if (pos == 2)
+	t = _mm_and_si64 (t, MC (mask_2));
+    else if (pos == 3)
+	t = _mm_and_si64 (t, MC (mask_3));
+
+    p = _mm_or_si64 (r, t);
+    p = _mm_or_si64 (g, p);
+
+    return _mm_or_si64 (b, p);
+#endif
+}
+
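+/* In pack_4xpacked565 below, a single pmaddwd does most of the repacking:
+ * within each 32-bit pixel, the masked blue word is multiplied by 4
+ * (moving b's top five bits to bits 5-9) and the masked red word by
+ * 0x2000 (moving r's top five bits to bits 16-20), and the two products
+ * are summed.  OR-ing in green and shifting right by 5 then leaves
+ * r5g6b5 in the low half of each 32-bit lane, ready to be merged into
+ * four packed pixels. */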
+static force_inline __m64
+pack_4xpacked565 (__m64 a, __m64 b)
+{
+    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
+    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
+
+    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
+    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
+
+    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
+    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
+
+    t0 = _mm_or_si64 (t0, g0);
+    t1 = _mm_or_si64 (t1, g1);
+
+    t0 = shift(t0, -5);
+#ifdef USE_ARM_IWMMXT
+    t1 = shift(t1, -5);
+    return _mm_packs_pu32 (t0, t1);
+#else
+    t1 = shift(t1, -5 + 16);
+    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
+#endif
+}
+
+#ifndef _MSC_VER
+
+static force_inline __m64
+pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
+{
+    return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
+}
+
+static force_inline __m64
+pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
+{
+    x = pix_multiply (x, a);
+    y = pix_multiply (y, b);
+
+    return pix_add (x, y);
+}
+
+#else
+
+/* MSVC only handles a "pass by register" of up to three SSE intrinsics */
+
+#define pack_4x565(v0, v1, v2, v3)					\
+    pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))
+
+#define pix_add_mul(x, a, y, b)			\
+    ( x = pix_multiply (x, a),			\
+      y = pix_multiply (y, b),			\
+      pix_add (x, y) )
+
+#endif
+
+/* --------------- MMX code patch for fbcompose.c --------------------- */
+
+static force_inline __m64
+combine (const uint32_t *src, const uint32_t *mask)
+{
+    __m64 vsrc = load8888 (src);
+
+    if (mask)
+    {
+	__m64 m = load8888 (mask);
+
+	m = expand_alpha (m);
+	vsrc = pix_multiply (vsrc, m);
+    }
+
+    return vsrc;
+}
+
+static force_inline __m64
+core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
+{
+    vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
+
+    if (is_opaque (vsrc))
+    {
+	return vsrc;
+    }
+    else if (!is_zero (vsrc))
+    {
+	return over (vsrc, expand_alpha (vsrc),
+		     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
+    }
+
+    return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
+}
+
+static void
+mmx_combine_over_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 vsrc = combine (src, mask);
+
+	if (is_opaque (vsrc))
+	{
+	    store8888 (dest, vsrc);
+	}
+	else if (!is_zero (vsrc))
+	{
+	    __m64 sa = expand_alpha (vsrc);
+	    store8888 (dest, over (vsrc, sa, load8888 (dest)));
+	}
+
+	++dest;
+	++src;
+	if (mask)
+	    ++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_over_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 d, da;
+	__m64 s = combine (src, mask);
+
+	d = load8888 (dest);
+	da = expand_alpha (d);
+	store8888 (dest, over (d, da, s));
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_in_u (pixman_implementation_t *imp,
+                  pixman_op_t              op,
+                  uint32_t *               dest,
+                  const uint32_t *         src,
+                  const uint32_t *         mask,
+                  int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 a;
+	__m64 x = combine (src, mask);
+
+	a = load8888 (dest);
+	a = expand_alpha (a);
+	x = pix_multiply (x, a);
+
+	store8888 (dest, x);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_in_reverse_u (pixman_implementation_t *imp,
+                          pixman_op_t              op,
+                          uint32_t *               dest,
+                          const uint32_t *         src,
+                          const uint32_t *         mask,
+                          int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 a = combine (src, mask);
+	__m64 x;
+
+	x = load8888 (dest);
+	a = expand_alpha (a);
+	x = pix_multiply (x, a);
+	store8888 (dest, x);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_out_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 a;
+	__m64 x = combine (src, mask);
+
+	a = load8888 (dest);
+	a = expand_alpha (a);
+	a = negate (a);
+	x = pix_multiply (x, a);
+	store8888 (dest, x);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_out_reverse_u (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               dest,
+                           const uint32_t *         src,
+                           const uint32_t *         mask,
+                           int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 a = combine (src, mask);
+	__m64 x;
+
+	x = load8888 (dest);
+	a = expand_alpha (a);
+	a = negate (a);
+	x = pix_multiply (x, a);
+
+	store8888 (dest, x);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_atop_u (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 da, d, sia;
+	__m64 s = combine (src, mask);
+
+	d = load8888 (dest);
+	sia = expand_alpha (s);
+	sia = negate (sia);
+	da = expand_alpha (d);
+	s = pix_add_mul (s, da, d, sia);
+	store8888 (dest, s);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    const uint32_t *end;
+
+    end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 dia, d, sa;
+	__m64 s = combine (src, mask);
+
+	d = load8888 (dest);
+	sa = expand_alpha (s);
+	dia = expand_alpha (d);
+	dia = negate (dia);
+	s = pix_add_mul (s, dia, d, sa);
+	store8888 (dest, s);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_xor_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 dia, d, sia;
+	__m64 s = combine (src, mask);
+
+	d = load8888 (dest);
+	sia = expand_alpha (s);
+	dia = expand_alpha (d);
+	sia = negate (sia);
+	dia = negate (dia);
+	s = pix_add_mul (s, dia, d, sia);
+	store8888 (dest, s);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_add_u (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	__m64 d;
+	__m64 s = combine (src, mask);
+
+	d = load8888 (dest);
+	s = pix_add (s, d);
+	store8888 (dest, s);
+
+	++dest;
+	++src;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_saturate_u (pixman_implementation_t *imp,
+                        pixman_op_t              op,
+                        uint32_t *               dest,
+                        const uint32_t *         src,
+                        const uint32_t *         mask,
+                        int                      width)
+{
+    const uint32_t *end = dest + width;
+
+    while (dest < end)
+    {
+	uint32_t s, sa, da;
+	uint32_t d = *dest;
+	__m64 ms = combine (src, mask);
+	__m64 md = load8888 (dest);
+
+	store8888(&s, ms);
+	da = ~d >> 24;
+	sa = s >> 24;
+
+	if (sa > da)
+	{
+	    uint32_t quot = DIV_UN8 (da, sa) << 24;
+	    __m64 msa = load8888 (&quot);
+	    msa = expand_alpha (msa);
+	    ms = pix_multiply (ms, msa);
+	}
+
+	md = pix_add (md, ms);
+	store8888 (dest, md);
+
+	++src;
+	++dest;
+	if (mask)
+	    mask++;
+    }
+    _mm_empty ();
+}
+
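+/* The *_ca variants below are the component-alpha combiners: the mask
+ * supplies a separate alpha per color channel, so it is multiplied in
+ * channel-wise instead of first being collapsed to a single alpha with
+ * expand_alpha () as in the unified-alpha *_u paths above. */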
+static void
+mmx_combine_src_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+
+	s = pix_multiply (s, a);
+	store8888 (dest, s);
+
+	++src;
+	++mask;
+	++dest;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_over_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dest,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 sa = expand_alpha (s);
+
+	store8888 (dest, in_over (s, sa, a, d));
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dest,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 da = expand_alpha (d);
+
+	store8888 (dest, over (d, da, in (s, a)));
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_in_ca (pixman_implementation_t *imp,
+                   pixman_op_t              op,
+                   uint32_t *               dest,
+                   const uint32_t *         src,
+                   const uint32_t *         mask,
+                   int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 da = expand_alpha (d);
+
+	s = pix_multiply (s, a);
+	s = pix_multiply (s, da);
+	store8888 (dest, s);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
+                           pixman_op_t              op,
+                           uint32_t *               dest,
+                           const uint32_t *         src,
+                           const uint32_t *         mask,
+                           int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 sa = expand_alpha (s);
+
+	a = pix_multiply (a, sa);
+	d = pix_multiply (d, a);
+	store8888 (dest, d);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_out_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 da = expand_alpha (d);
+
+	da = negate (da);
+	s = pix_multiply (s, a);
+	s = pix_multiply (s, da);
+	store8888 (dest, s);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
+                            pixman_op_t              op,
+                            uint32_t *               dest,
+                            const uint32_t *         src,
+                            const uint32_t *         mask,
+                            int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 sa = expand_alpha (s);
+
+	a = pix_multiply (a, sa);
+	a = negate (a);
+	d = pix_multiply (d, a);
+	store8888 (dest, d);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_atop_ca (pixman_implementation_t *imp,
+                     pixman_op_t              op,
+                     uint32_t *               dest,
+                     const uint32_t *         src,
+                     const uint32_t *         mask,
+                     int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 da = expand_alpha (d);
+	__m64 sa = expand_alpha (s);
+
+	s = pix_multiply (s, a);
+	a = pix_multiply (a, sa);
+	a = negate (a);
+	d = pix_add_mul (d, a, s, da);
+	store8888 (dest, d);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
+                             pixman_op_t              op,
+                             uint32_t *               dest,
+                             const uint32_t *         src,
+                             const uint32_t *         mask,
+                             int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 da = expand_alpha (d);
+	__m64 sa = expand_alpha (s);
+
+	s = pix_multiply (s, a);
+	a = pix_multiply (a, sa);
+	da = negate (da);
+	d = pix_add_mul (d, a, s, da);
+	store8888 (dest, d);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_xor_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+	__m64 da = expand_alpha (d);
+	__m64 sa = expand_alpha (s);
+
+	s = pix_multiply (s, a);
+	a = pix_multiply (a, sa);
+	da = negate (da);
+	a = negate (a);
+	d = pix_add_mul (d, a, s, da);
+	store8888 (dest, d);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_combine_add_ca (pixman_implementation_t *imp,
+                    pixman_op_t              op,
+                    uint32_t *               dest,
+                    const uint32_t *         src,
+                    const uint32_t *         mask,
+                    int                      width)
+{
+    const uint32_t *end = src + width;
+
+    while (src < end)
+    {
+	__m64 a = load8888 (mask);
+	__m64 s = load8888 (src);
+	__m64 d = load8888 (dest);
+
+	s = pix_multiply (s, a);
+	d = pix_add (s, d);
+	store8888 (dest, d);
+
+	++src;
+	++dest;
+	++mask;
+    }
+    _mm_empty ();
+}
+
+/* ------------- MMX code paths called from fbpict.c -------------------- */
+
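+/* The composite functions below share one loop structure: a head loop
+ * handles single pixels until dst reaches 8-byte alignment, an unrolled
+ * body then works on whole __m64 quantities (two 8888 or four 565
+ * pixels per iteration), and a tail loop finishes the remainder.  The
+ * destination is aligned by the head loop; sources are read with the
+ * unaligned ldq_u where needed. */
+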
+static void
+mmx_composite_over_n_8888 (pixman_implementation_t *imp,
+                           pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t *dst_line, *dst;
+    int32_t w;
+    int dst_stride;
+    __m64 vsrc, vsrca;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+
+    vsrc = load8888 (&src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	w = width;
+
+	CHECKPOINT ();
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
+
+	    w--;
+	    dst++;
+	}
+
+	while (w >= 2)
+	{
+	    __m64 vdest;
+	    __m64 dest0, dest1;
+
+	    vdest = *(__m64 *)dst;
+
+	    dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
+	    dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
+
+	    *(__m64 *)dst = pack8888 (dest0, dest1);
+
+	    dst += 2;
+	    w -= 2;
+	}
+
+	CHECKPOINT ();
+
+	if (w)
+	{
+	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_n_0565 (pixman_implementation_t *imp,
+                           pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint16_t *dst_line, *dst;
+    int32_t w;
+    int dst_stride;
+    __m64 vsrc, vsrca;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+
+    vsrc = load8888 (&src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	w = width;
+
+	CHECKPOINT ();
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    uint64_t d = *dst;
+	    __m64 vdest = expand565 (to_m64 (d), 0);
+
+	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
+	    *dst = to_uint64 (vdest);
+
+	    w--;
+	    dst++;
+	}
+
+	while (w >= 4)
+	{
+	    __m64 vdest = *(__m64 *)dst;
+	    __m64 v0, v1, v2, v3;
+
+	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
+
+	    v0 = over (vsrc, vsrca, v0);
+	    v1 = over (vsrc, vsrca, v1);
+	    v2 = over (vsrc, vsrca, v2);
+	    v3 = over (vsrc, vsrca, v3);
+
+	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
+
+	    dst += 4;
+	    w -= 4;
+	}
+
+	CHECKPOINT ();
+
+	while (w)
+	{
+	    uint64_t d = *dst;
+	    __m64 vdest = expand565 (to_m64 (d), 0);
+
+	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
+	    *dst = to_uint64 (vdest);
+
+	    w--;
+	    dst++;
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
+                                   pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src;
+    uint32_t *dst_line;
+    uint32_t *mask_line;
+    int dst_stride, mask_stride;
+    __m64 vsrc, vsrca;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    if (src == 0)
+	return;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
+
+    vsrc = load8888 (&src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+	int twidth = width;
+	uint32_t *p = (uint32_t *)mask_line;
+	uint32_t *q = (uint32_t *)dst_line;
+
+	while (twidth && (uintptr_t)q & 7)
+	{
+	    uint32_t m = *(uint32_t *)p;
+
+	    if (m)
+	    {
+		__m64 vdest = load8888 (q);
+		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
+		store8888 (q, vdest);
+	    }
+
+	    twidth--;
+	    p++;
+	    q++;
+	}
+
+	while (twidth >= 2)
+	{
+	    uint32_t m0, m1;
+	    m0 = *p;
+	    m1 = *(p + 1);
+
+	    if (m0 | m1)
+	    {
+		__m64 dest0, dest1;
+		__m64 vdest = *(__m64 *)q;
+
+		dest0 = in_over (vsrc, vsrca, load8888 (&m0),
+				 expand8888 (vdest, 0));
+		dest1 = in_over (vsrc, vsrca, load8888 (&m1),
+				 expand8888 (vdest, 1));
+
+		*(__m64 *)q = pack8888 (dest0, dest1);
+	    }
+
+	    p += 2;
+	    q += 2;
+	    twidth -= 2;
+	}
+
+	if (twidth)
+	{
+	    uint32_t m = *(uint32_t *)p;
+
+	    if (m)
+	    {
+		__m64 vdest = load8888 (q);
+		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
+		store8888 (q, vdest);
+	    }
+
+	    twidth--;
+	    p++;
+	    q++;
+	}
+
+	dst_line += dst_stride;
+	mask_line += mask_stride;
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_line, *dst;
+    uint32_t *src_line, *src;
+    uint32_t mask;
+    __m64 vmask;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
+    vmask = expand_alpha (load8888 (&mask));
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    __m64 s = load8888 (src);
+	    __m64 d = load8888 (dst);
+
+	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+
+	while (w >= 2)
+	{
+	    __m64 vs = ldq_u ((__m64 *)src);
+	    __m64 vd = *(__m64 *)dst;
+	    __m64 vsrc0 = expand8888 (vs, 0);
+	    __m64 vsrc1 = expand8888 (vs, 1);
+
+	    *(__m64 *)dst = pack8888 (
+		in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
+		in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
+
+	    w -= 2;
+	    dst += 2;
+	    src += 2;
+	}
+
+	if (w)
+	{
+	    __m64 s = load8888 (src);
+	    __m64 d = load8888 (dst);
+
+	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
+                                pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_line, *dst;
+    uint32_t *src_line, *src;
+    uint32_t mask;
+    __m64 vmask;
+    int dst_stride, src_stride;
+    int32_t w;
+    __m64 srca;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
+
+    vmask = expand_alpha (load8888 (&mask));
+    srca = MC (4x00ff);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    uint32_t ssrc = *src | 0xff000000;
+	    __m64 s = load8888 (&ssrc);
+	    __m64 d = load8888 (dst);
+
+	    store8888 (dst, in_over (s, srca, vmask, d));
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+
+	while (w >= 16)
+	{
+	    __m64 vd0 = *(__m64 *)(dst + 0);
+	    __m64 vd1 = *(__m64 *)(dst + 2);
+	    __m64 vd2 = *(__m64 *)(dst + 4);
+	    __m64 vd3 = *(__m64 *)(dst + 6);
+	    __m64 vd4 = *(__m64 *)(dst + 8);
+	    __m64 vd5 = *(__m64 *)(dst + 10);
+	    __m64 vd6 = *(__m64 *)(dst + 12);
+	    __m64 vd7 = *(__m64 *)(dst + 14);
+
+	    __m64 vs0 = ldq_u ((__m64 *)(src + 0));
+	    __m64 vs1 = ldq_u ((__m64 *)(src + 2));
+	    __m64 vs2 = ldq_u ((__m64 *)(src + 4));
+	    __m64 vs3 = ldq_u ((__m64 *)(src + 6));
+	    __m64 vs4 = ldq_u ((__m64 *)(src + 8));
+	    __m64 vs5 = ldq_u ((__m64 *)(src + 10));
+	    __m64 vs6 = ldq_u ((__m64 *)(src + 12));
+	    __m64 vs7 = ldq_u ((__m64 *)(src + 14));
+
+	    vd0 = pack8888 (
+		in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
+		in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
+
+	    vd1 = pack8888 (
+		in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
+		in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
+
+	    vd2 = pack8888 (
+		in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
+		in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
+
+	    vd3 = pack8888 (
+		in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
+		in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
+
+	    vd4 = pack8888 (
+		in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
+		in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
+
+	    vd5 = pack8888 (
+		in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
+		in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
+
+	    vd6 = pack8888 (
+		in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
+		in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
+
+	    vd7 = pack8888 (
+		in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
+		in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
+
+	    *(__m64 *)(dst + 0) = vd0;
+	    *(__m64 *)(dst + 2) = vd1;
+	    *(__m64 *)(dst + 4) = vd2;
+	    *(__m64 *)(dst + 6) = vd3;
+	    *(__m64 *)(dst + 8) = vd4;
+	    *(__m64 *)(dst + 10) = vd5;
+	    *(__m64 *)(dst + 12) = vd6;
+	    *(__m64 *)(dst + 14) = vd7;
+
+	    w -= 16;
+	    dst += 16;
+	    src += 16;
+	}
+
+	while (w)
+	{
+	    uint32_t ssrc = *src | 0xff000000;
+	    __m64 s = load8888 (&ssrc);
+	    __m64 d = load8888 (dst);
+
+	    store8888 (dst, in_over (s, srca, vmask, d));
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t *dst_line, *dst;
+    uint32_t *src_line, *src;
+    uint32_t s;
+    int dst_stride, src_stride;
+    uint8_t a;
+    int32_t w;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	while (w--)
+	{
+	    s = *src++;
+	    a = s >> 24;
+
+	    if (a == 0xff)
+	    {
+		*dst = s;
+	    }
+	    else if (s)
+	    {
+		__m64 ms, sa;
+		ms = load8888 (&s);
+		sa = expand_alpha (ms);
+		store8888 (dst, over (ms, sa, load8888 (dst)));
+	    }
+
+	    dst++;
+	}
+    }
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
+                              pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint16_t *dst_line, *dst;
+    uint32_t *src_line, *src;
+    int dst_stride, src_stride;
+    int32_t w;
+
+    CHECKPOINT ();
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
+
+#if 0
+    /* FIXME */
+    assert (src_image->drawable == mask_image->drawable);
+#endif
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	src = src_line;
+	src_line += src_stride;
+	w = width;
+
+	CHECKPOINT ();
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    __m64 vsrc = load8888 (src);
+	    uint64_t d = *dst;
+	    __m64 vdest = expand565 (to_m64 (d), 0);
+
+	    vdest = pack_565 (
+		over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
+
+	    *dst = to_uint64 (vdest);
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+
+	CHECKPOINT ();
+
+	while (w >= 4)
+	{
+	    __m64 vdest = *(__m64 *)dst;
+	    __m64 v0, v1, v2, v3;
+	    __m64 vsrc0, vsrc1, vsrc2, vsrc3;
+
+	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
+
+	    vsrc0 = load8888 ((src + 0));
+	    vsrc1 = load8888 ((src + 1));
+	    vsrc2 = load8888 ((src + 2));
+	    vsrc3 = load8888 ((src + 3));
+
+	    v0 = over (vsrc0, expand_alpha (vsrc0), v0);
+	    v1 = over (vsrc1, expand_alpha (vsrc1), v1);
+	    v2 = over (vsrc2, expand_alpha (vsrc2), v2);
+	    v3 = over (vsrc3, expand_alpha (vsrc3), v3);
+
+	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
+
+	    w -= 4;
+	    dst += 4;
+	    src += 4;
+	}
+
+	CHECKPOINT ();
+
+	while (w)
+	{
+	    __m64 vsrc = load8888 (src);
+	    uint64_t d = *dst;
+	    __m64 vdest = expand565 (to_m64 (d), 0);
+
+	    vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
+
+	    *dst = to_uint64 (vdest);
+
+	    w--;
+	    dst++;
+	    src++;
+	}
+    }
+
+    _mm_empty ();
+}
+
+static void
+mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
+                             pixman_composite_info_t *info)
+{
+    PIXMAN_COMPOSITE_ARGS (info);
+    uint32_t src, srca;
+    uint32_t *dst_line, *dst;
+    uint8_t *mask_line, *mask;
+    int dst_stride, mask_stride;
+    int32_t w;
+    __m64 vsrc, vsrca;
+    uint64_t srcsrc;
+
+    CHECKPOINT ();
+
+    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
+
+    srca = src >> 24;
+    if (src == 0)
+	return;
+
+    srcsrc = (uint64_t)src << 32 | src;
+
+    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
+    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
+
+    vsrc = load8888 (&src);
+    vsrca = expand_alpha (vsrc);
+
+    while (height--)
+    {
+	dst = dst_line;
+	dst_line += dst_stride;
+	mask = mask_line;
+	mask_line += mask_stride;
+	w = width;
+
+	CHECKPOINT ();
+
+	while (w && (uintptr_t)dst & 7)
+	{
+	    uint64_t m = *mask;
+
+	    if (m)
+	    {
+		__m64 vdest = in_over (vsrc, vsrca,
+				       expand_alpha_rev (to_m64 (m)),
+				       load8888 (dst));
+
+		store8888 (dst, vdest);
+	    }
+
+	    w--;
+	    mask++;
+	    dst++;
+	}
+
+	CHECKPOINT ();
+
+	while (w >= 2)
+	{
+	    uint64_t m0, m1;
+
+	    m0 = *mask;
+	    m1 = *(mask + 1);
+
+	    if (srca == 0xff && (m0 & m1) == 0xff)
+	    {
+		*(uint64_t *)dst = srcsrc;
+	    }
+	    else if (m0 | m1)
+	    {
+		__m64 vdest;
+		__m64 dest0, dest1;
+
+		vdest = *(__m64 *)dst;
+
+		dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
+				 expand8888 (vdest, 0));
+		dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
+				 expand8888 (vdest, 1));
+
+		*(__m64 *)dst = pack8888 (dest0, dest1);
+	    }
+
+	    mask += 2;
+	    dst += 2;
+	    w -= 2;
+	}
+
+	CHECKPOINT ();
+
+	if (w)
+	{
+	    uint64_t m = *mask;
+
+	    if (m)
+	    {
+		__m64 vdest = load8888 (dst);
+
+		vdest = in_over (
+		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
+		store8888 (dst, vdest);
+	    }
+	}
+    }
+
+    _mm_empty ();
+}
+
+static pixman_bool_t
+mmx_fill (pixman_implementation_t *imp,
+          uint32_t *               bits,
+          int                      stride,
+          int                      bpp,
+          int                      x,
+          int                      y,
+          int                      width,
+          int                      height,
+          uint32_t                 filler)
+{
+    uint64_t fill;
+    __m64 vfill;
+    uint32_t byte_width;
+    uint8_t *byte_line;
+
+#if defined __GNUC__ && defined USE_X86_MMX
+    __m64 v1, v2, v3, v4, v5, v6, v7;
+#endif
+
+    if (bpp != 16 && bpp != 32 && bpp != 8)
+	return FALSE;
+
+    if (bpp == 8)
+    {
+	stride = stride * (int) sizeof (uint32_t) / 1;
+	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
+	byte_width = width;
+	stride *= 1;
+	filler = (filler & 0xff) * 0x01010101;
+    }
+    else if (bpp == 16)
+    {
+	stride = stride * (int) sizeof (uint32_t) / 2;
+	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
+	byte_width = 2 * width;
+	stride *= 2;
+	filler = (filler & 0xffff) * 0x00010001;
+    }
+    else
+    {
+	stride = stride * (int) sizeof (uint32_t) / 4;
+	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
+	byte_width = 4 * width;
+	stride *= 4;
+    }
+
+    fill = ((uint64_t)filler << 32) | filler;
+    vfill = to_m64 (fill);
+
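+/* The asm block below copies the fill value into seven more MMX
+ * registers up front, so the 64-byte inner loop can issue its eight
+ * movq stores back to back without gcc rematerializing the value. */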
1.2180 + *(__m64*) (d + 56) = vfill; 1.2181 +#endif 1.2182 + w -= 64; 1.2183 + d += 64; 1.2184 + } 1.2185 + 1.2186 + while (w >= 4) 1.2187 + { 1.2188 + *(uint32_t *)d = filler; 1.2189 + 1.2190 + w -= 4; 1.2191 + d += 4; 1.2192 + } 1.2193 + if (w >= 2) 1.2194 + { 1.2195 + *(uint16_t *)d = filler; 1.2196 + w -= 2; 1.2197 + d += 2; 1.2198 + } 1.2199 + if (w >= 1) 1.2200 + { 1.2201 + *(uint8_t *)d = (filler & 0xff); 1.2202 + w--; 1.2203 + d++; 1.2204 + } 1.2205 + 1.2206 + } 1.2207 + 1.2208 + _mm_empty (); 1.2209 + return TRUE; 1.2210 +} 1.2211 + 1.2212 +static void 1.2213 +mmx_composite_src_x888_0565 (pixman_implementation_t *imp, 1.2214 + pixman_composite_info_t *info) 1.2215 +{ 1.2216 + PIXMAN_COMPOSITE_ARGS (info); 1.2217 + uint16_t *dst_line, *dst; 1.2218 + uint32_t *src_line, *src, s; 1.2219 + int dst_stride, src_stride; 1.2220 + int32_t w; 1.2221 + 1.2222 + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 1.2223 + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 1.2224 + 1.2225 + while (height--) 1.2226 + { 1.2227 + dst = dst_line; 1.2228 + dst_line += dst_stride; 1.2229 + src = src_line; 1.2230 + src_line += src_stride; 1.2231 + w = width; 1.2232 + 1.2233 + while (w && (uintptr_t)dst & 7) 1.2234 + { 1.2235 + s = *src++; 1.2236 + *dst = convert_8888_to_0565 (s); 1.2237 + dst++; 1.2238 + w--; 1.2239 + } 1.2240 + 1.2241 + while (w >= 4) 1.2242 + { 1.2243 + __m64 vdest; 1.2244 + __m64 vsrc0 = ldq_u ((__m64 *)(src + 0)); 1.2245 + __m64 vsrc1 = ldq_u ((__m64 *)(src + 2)); 1.2246 + 1.2247 + vdest = pack_4xpacked565 (vsrc0, vsrc1); 1.2248 + 1.2249 + *(__m64 *)dst = vdest; 1.2250 + 1.2251 + w -= 4; 1.2252 + src += 4; 1.2253 + dst += 4; 1.2254 + } 1.2255 + 1.2256 + while (w) 1.2257 + { 1.2258 + s = *src++; 1.2259 + *dst = convert_8888_to_0565 (s); 1.2260 + dst++; 1.2261 + w--; 1.2262 + } 1.2263 + } 1.2264 + 1.2265 + _mm_empty (); 1.2266 +} 1.2267 + 1.2268 +static void 1.2269 +mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, 1.2270 + pixman_composite_info_t *info) 1.2271 +{ 1.2272 + PIXMAN_COMPOSITE_ARGS (info); 1.2273 + uint32_t src, srca; 1.2274 + uint32_t *dst_line, *dst; 1.2275 + uint8_t *mask_line, *mask; 1.2276 + int dst_stride, mask_stride; 1.2277 + int32_t w; 1.2278 + __m64 vsrc; 1.2279 + uint64_t srcsrc; 1.2280 + 1.2281 + CHECKPOINT (); 1.2282 + 1.2283 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.2284 + 1.2285 + srca = src >> 24; 1.2286 + if (src == 0) 1.2287 + { 1.2288 + mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride, 1.2289 + PIXMAN_FORMAT_BPP (dest_image->bits.format), 1.2290 + dest_x, dest_y, width, height, 0); 1.2291 + return; 1.2292 + } 1.2293 + 1.2294 + srcsrc = (uint64_t)src << 32 | src; 1.2295 + 1.2296 + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.2297 + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 1.2298 + 1.2299 + vsrc = load8888 (&src); 1.2300 + 1.2301 + while (height--) 1.2302 + { 1.2303 + dst = dst_line; 1.2304 + dst_line += dst_stride; 1.2305 + mask = mask_line; 1.2306 + mask_line += mask_stride; 1.2307 + w = width; 1.2308 + 1.2309 + CHECKPOINT (); 1.2310 + 1.2311 + while (w && (uintptr_t)dst & 7) 1.2312 + { 1.2313 + uint64_t m = *mask; 1.2314 + 1.2315 + if (m) 1.2316 + { 1.2317 + __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); 1.2318 + 1.2319 + store8888 (dst, vdest); 1.2320 + } 1.2321 + else 1.2322 + { 1.2323 + *dst = 0; 1.2324 + } 1.2325 + 1.2326 
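The srcsrc value prepared above is what makes the two-at-a-time loop cheap when the source is opaque and both mask bytes are 0xff: the solid 32-bit colour is replicated into both halves of a 64-bit word so one store covers two pixels. A scalar sketch of the replication (hypothetical helper name):

```c
#include <stdint.h>

/* Replicate a solid 32-bit pixel into both halves of a 64-bit word,
 * as done for srcsrc above, so the "srca == 0xff && (m0 & m1) == 0xff"
 * case becomes a single 64-bit store. */
static inline uint64_t
replicate_8888 (uint32_t src)
{
    return ((uint64_t)src << 32) | src;
}
```

Note that `(m0 & m1) == 0xff` holds only when both 8-bit mask values are 0xff, so the shortcut never skips a partially covered pixel.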
+ w--; 1.2327 + mask++; 1.2328 + dst++; 1.2329 + } 1.2330 + 1.2331 + CHECKPOINT (); 1.2332 + 1.2333 + while (w >= 2) 1.2334 + { 1.2335 + uint64_t m0, m1; 1.2336 + m0 = *mask; 1.2337 + m1 = *(mask + 1); 1.2338 + 1.2339 + if (srca == 0xff && (m0 & m1) == 0xff) 1.2340 + { 1.2341 + *(uint64_t *)dst = srcsrc; 1.2342 + } 1.2343 + else if (m0 | m1) 1.2344 + { 1.2345 + __m64 dest0, dest1; 1.2346 + 1.2347 + dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0))); 1.2348 + dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1))); 1.2349 + 1.2350 + *(__m64 *)dst = pack8888 (dest0, dest1); 1.2351 + } 1.2352 + else 1.2353 + { 1.2354 + *(uint64_t *)dst = 0; 1.2355 + } 1.2356 + 1.2357 + mask += 2; 1.2358 + dst += 2; 1.2359 + w -= 2; 1.2360 + } 1.2361 + 1.2362 + CHECKPOINT (); 1.2363 + 1.2364 + if (w) 1.2365 + { 1.2366 + uint64_t m = *mask; 1.2367 + 1.2368 + if (m) 1.2369 + { 1.2370 + __m64 vdest = load8888 (dst); 1.2371 + 1.2372 + vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); 1.2373 + store8888 (dst, vdest); 1.2374 + } 1.2375 + else 1.2376 + { 1.2377 + *dst = 0; 1.2378 + } 1.2379 + } 1.2380 + } 1.2381 + 1.2382 + _mm_empty (); 1.2383 +} 1.2384 + 1.2385 +static void 1.2386 +mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, 1.2387 + pixman_composite_info_t *info) 1.2388 +{ 1.2389 + PIXMAN_COMPOSITE_ARGS (info); 1.2390 + uint32_t src, srca; 1.2391 + uint16_t *dst_line, *dst; 1.2392 + uint8_t *mask_line, *mask; 1.2393 + int dst_stride, mask_stride; 1.2394 + int32_t w; 1.2395 + __m64 vsrc, vsrca, tmp; 1.2396 + __m64 srcsrcsrcsrc; 1.2397 + 1.2398 + CHECKPOINT (); 1.2399 + 1.2400 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.2401 + 1.2402 + srca = src >> 24; 1.2403 + if (src == 0) 1.2404 + return; 1.2405 + 1.2406 + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 1.2407 + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 1.2408 + 1.2409 + vsrc = load8888 (&src); 1.2410 + vsrca = expand_alpha (vsrc); 1.2411 + 1.2412 + tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0); 1.2413 + srcsrcsrcsrc = expand_alpha_rev (tmp); 1.2414 + 1.2415 + while (height--) 1.2416 + { 1.2417 + dst = dst_line; 1.2418 + dst_line += dst_stride; 1.2419 + mask = mask_line; 1.2420 + mask_line += mask_stride; 1.2421 + w = width; 1.2422 + 1.2423 + CHECKPOINT (); 1.2424 + 1.2425 + while (w && (uintptr_t)dst & 7) 1.2426 + { 1.2427 + uint64_t m = *mask; 1.2428 + 1.2429 + if (m) 1.2430 + { 1.2431 + uint64_t d = *dst; 1.2432 + __m64 vd = to_m64 (d); 1.2433 + __m64 vdest = in_over ( 1.2434 + vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0)); 1.2435 + 1.2436 + vd = pack_565 (vdest, _mm_setzero_si64 (), 0); 1.2437 + *dst = to_uint64 (vd); 1.2438 + } 1.2439 + 1.2440 + w--; 1.2441 + mask++; 1.2442 + dst++; 1.2443 + } 1.2444 + 1.2445 + CHECKPOINT (); 1.2446 + 1.2447 + while (w >= 4) 1.2448 + { 1.2449 + uint64_t m0, m1, m2, m3; 1.2450 + m0 = *mask; 1.2451 + m1 = *(mask + 1); 1.2452 + m2 = *(mask + 2); 1.2453 + m3 = *(mask + 3); 1.2454 + 1.2455 + if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff) 1.2456 + { 1.2457 + *(__m64 *)dst = srcsrcsrcsrc; 1.2458 + } 1.2459 + else if (m0 | m1 | m2 | m3) 1.2460 + { 1.2461 + __m64 vdest = *(__m64 *)dst; 1.2462 + __m64 v0, v1, v2, v3; 1.2463 + __m64 vm0, vm1, vm2, vm3; 1.2464 + 1.2465 + expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); 1.2466 + 1.2467 + vm0 = to_m64 (m0); 1.2468 + v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0); 1.2469 + 1.2470 + vm1 = to_m64 (m1); 1.2471 + v1 = in_over (vsrc, vsrca, 
expand_alpha_rev (vm1), v1); 1.2472 + 1.2473 + vm2 = to_m64 (m2); 1.2474 + v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2); 1.2475 + 1.2476 + vm3 = to_m64 (m3); 1.2477 + v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3); 1.2478 + 1.2479 + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); 1.2480 + } 1.2481 + 1.2482 + w -= 4; 1.2483 + mask += 4; 1.2484 + dst += 4; 1.2485 + } 1.2486 + 1.2487 + CHECKPOINT (); 1.2488 + 1.2489 + while (w) 1.2490 + { 1.2491 + uint64_t m = *mask; 1.2492 + 1.2493 + if (m) 1.2494 + { 1.2495 + uint64_t d = *dst; 1.2496 + __m64 vd = to_m64 (d); 1.2497 + __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)), 1.2498 + expand565 (vd, 0)); 1.2499 + vd = pack_565 (vdest, _mm_setzero_si64 (), 0); 1.2500 + *dst = to_uint64 (vd); 1.2501 + } 1.2502 + 1.2503 + w--; 1.2504 + mask++; 1.2505 + dst++; 1.2506 + } 1.2507 + } 1.2508 + 1.2509 + _mm_empty (); 1.2510 +} 1.2511 + 1.2512 +static void 1.2513 +mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, 1.2514 + pixman_composite_info_t *info) 1.2515 +{ 1.2516 + PIXMAN_COMPOSITE_ARGS (info); 1.2517 + uint16_t *dst_line, *dst; 1.2518 + uint32_t *src_line, *src; 1.2519 + int dst_stride, src_stride; 1.2520 + int32_t w; 1.2521 + 1.2522 + CHECKPOINT (); 1.2523 + 1.2524 + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 1.2525 + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 1.2526 + 1.2527 +#if 0 1.2528 + /* FIXME */ 1.2529 + assert (src_image->drawable == mask_image->drawable); 1.2530 +#endif 1.2531 + 1.2532 + while (height--) 1.2533 + { 1.2534 + dst = dst_line; 1.2535 + dst_line += dst_stride; 1.2536 + src = src_line; 1.2537 + src_line += src_stride; 1.2538 + w = width; 1.2539 + 1.2540 + CHECKPOINT (); 1.2541 + 1.2542 + while (w && (uintptr_t)dst & 7) 1.2543 + { 1.2544 + __m64 vsrc = load8888 (src); 1.2545 + uint64_t d = *dst; 1.2546 + __m64 vdest = expand565 (to_m64 (d), 0); 1.2547 + 1.2548 + vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0); 1.2549 + 1.2550 + *dst = to_uint64 (vdest); 1.2551 + 1.2552 + w--; 1.2553 + dst++; 1.2554 + src++; 1.2555 + } 1.2556 + 1.2557 + CHECKPOINT (); 1.2558 + 1.2559 + while (w >= 4) 1.2560 + { 1.2561 + uint32_t s0, s1, s2, s3; 1.2562 + unsigned char a0, a1, a2, a3; 1.2563 + 1.2564 + s0 = *src; 1.2565 + s1 = *(src + 1); 1.2566 + s2 = *(src + 2); 1.2567 + s3 = *(src + 3); 1.2568 + 1.2569 + a0 = (s0 >> 24); 1.2570 + a1 = (s1 >> 24); 1.2571 + a2 = (s2 >> 24); 1.2572 + a3 = (s3 >> 24); 1.2573 + 1.2574 + if ((a0 & a1 & a2 & a3) == 0xFF) 1.2575 + { 1.2576 + __m64 v0 = invert_colors (load8888 (&s0)); 1.2577 + __m64 v1 = invert_colors (load8888 (&s1)); 1.2578 + __m64 v2 = invert_colors (load8888 (&s2)); 1.2579 + __m64 v3 = invert_colors (load8888 (&s3)); 1.2580 + 1.2581 + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); 1.2582 + } 1.2583 + else if (s0 | s1 | s2 | s3) 1.2584 + { 1.2585 + __m64 vdest = *(__m64 *)dst; 1.2586 + __m64 v0, v1, v2, v3; 1.2587 + 1.2588 + __m64 vsrc0 = load8888 (&s0); 1.2589 + __m64 vsrc1 = load8888 (&s1); 1.2590 + __m64 vsrc2 = load8888 (&s2); 1.2591 + __m64 vsrc3 = load8888 (&s3); 1.2592 + 1.2593 + expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); 1.2594 + 1.2595 + v0 = over_rev_non_pre (vsrc0, v0); 1.2596 + v1 = over_rev_non_pre (vsrc1, v1); 1.2597 + v2 = over_rev_non_pre (vsrc2, v2); 1.2598 + v3 = over_rev_non_pre (vsrc3, v3); 1.2599 + 1.2600 + *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); 1.2601 + } 1.2602 + 1.2603 + w -= 4; 1.2604 + dst += 4; 1.2605 + src += 4; 1.2606 + } 1.2607
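The 0565 paths above all funnel through pack_565/pack_4x565, which keep the top 5/6/5 bits of each channel. As a scalar reference for what that packing computes (a sketch only; the real pack_565 can also merge the result into an existing destination word via its field argument):

```c
#include <stdint.h>

/* Scalar reference for the 8888 -> 0565 packing these paths vectorise:
 * keep the top 5/6/5 bits of red, green and blue. */
static inline uint16_t
pack_565_scalar (uint32_t s)              /* s laid out as xxRRGGBB */
{
    return (uint16_t)(((s >> 8) & 0xf800) |   /* R bits 23:19 -> 15:11 */
                      ((s >> 5) & 0x07e0) |   /* G bits 15:10 -> 10:5  */
                      ((s >> 3) & 0x001f));   /* B bits  7:3  ->  4:0  */
}
```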
+ 1.2608 + CHECKPOINT (); 1.2609 + 1.2610 + while (w) 1.2611 + { 1.2612 + __m64 vsrc = load8888 (src); 1.2613 + uint64_t d = *dst; 1.2614 + __m64 vdest = expand565 (to_m64 (d), 0); 1.2615 + 1.2616 + vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0); 1.2617 + 1.2618 + *dst = to_uint64 (vdest); 1.2619 + 1.2620 + w--; 1.2621 + dst++; 1.2622 + src++; 1.2623 + } 1.2624 + } 1.2625 + 1.2626 + _mm_empty (); 1.2627 +} 1.2628 + 1.2629 +static void 1.2630 +mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, 1.2631 + pixman_composite_info_t *info) 1.2632 +{ 1.2633 + PIXMAN_COMPOSITE_ARGS (info); 1.2634 + uint32_t *dst_line, *dst; 1.2635 + uint32_t *src_line, *src; 1.2636 + int dst_stride, src_stride; 1.2637 + int32_t w; 1.2638 + 1.2639 + CHECKPOINT (); 1.2640 + 1.2641 + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.2642 + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 1.2643 + 1.2644 +#if 0 1.2645 + /* FIXME */ 1.2646 + assert (src_image->drawable == mask_image->drawable); 1.2647 +#endif 1.2648 + 1.2649 + while (height--) 1.2650 + { 1.2651 + dst = dst_line; 1.2652 + dst_line += dst_stride; 1.2653 + src = src_line; 1.2654 + src_line += src_stride; 1.2655 + w = width; 1.2656 + 1.2657 + while (w && (uintptr_t)dst & 7) 1.2658 + { 1.2659 + __m64 s = load8888 (src); 1.2660 + __m64 d = load8888 (dst); 1.2661 + 1.2662 + store8888 (dst, over_rev_non_pre (s, d)); 1.2663 + 1.2664 + w--; 1.2665 + dst++; 1.2666 + src++; 1.2667 + } 1.2668 + 1.2669 + while (w >= 2) 1.2670 + { 1.2671 + uint32_t s0, s1; 1.2672 + unsigned char a0, a1; 1.2673 + __m64 d0, d1; 1.2674 + 1.2675 + s0 = *src; 1.2676 + s1 = *(src + 1); 1.2677 + 1.2678 + a0 = (s0 >> 24); 1.2679 + a1 = (s1 >> 24); 1.2680 + 1.2681 + if ((a0 & a1) == 0xFF) 1.2682 + { 1.2683 + d0 = invert_colors (load8888 (&s0)); 1.2684 + d1 = invert_colors (load8888 (&s1)); 1.2685 + 1.2686 + *(__m64 *)dst = pack8888 (d0, d1); 1.2687 + } 1.2688 + else if (s0 | s1) 1.2689 + { 1.2690 + __m64 vdest = *(__m64 *)dst; 1.2691 + 1.2692 + d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0)); 1.2693 + d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1)); 1.2694 + 1.2695 + *(__m64 *)dst = pack8888 (d0, d1); 1.2696 + } 1.2697 + 1.2698 + w -= 2; 1.2699 + dst += 2; 1.2700 + src += 2; 1.2701 + } 1.2702 + 1.2703 + if (w) 1.2704 + { 1.2705 + __m64 s = load8888 (src); 1.2706 + __m64 d = load8888 (dst); 1.2707 + 1.2708 + store8888 (dst, over_rev_non_pre (s, d)); 1.2709 + } 1.2710 + } 1.2711 + 1.2712 + _mm_empty (); 1.2713 +} 1.2714 + 1.2715 +static void 1.2716 +mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, 1.2717 + pixman_composite_info_t *info) 1.2718 +{ 1.2719 + PIXMAN_COMPOSITE_ARGS (info); 1.2720 + uint32_t src; 1.2721 + uint16_t *dst_line; 1.2722 + uint32_t *mask_line; 1.2723 + int dst_stride, mask_stride; 1.2724 + __m64 vsrc, vsrca; 1.2725 + 1.2726 + CHECKPOINT (); 1.2727 + 1.2728 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.2729 + 1.2730 + if (src == 0) 1.2731 + return; 1.2732 + 1.2733 + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 1.2734 + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); 1.2735 + 1.2736 + vsrc = load8888 (&src); 1.2737 + vsrca = expand_alpha (vsrc); 1.2738 + 1.2739 + while (height--) 1.2740 + { 1.2741 + int twidth = width; 1.2742 + uint32_t *p = (uint32_t *)mask_line; 1.2743 + uint16_t *q = (uint16_t *)dst_line; 1.2744 + 1.2745 + 
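The component-alpha loop below leans on in_over(), the fused (src IN mask) OVER dest operator used throughout this file. Here is a per-channel scalar model of it, assuming pixman's usual rounding divide-by-255; this is a sketch only, since the MMX version processes four 16-bit channels per instruction:

```c
#include <stdint.h>

/* Rounding x/255 in the style of pixman's MUL_UN8:
 * t = x + 0x80; result = (t + (t >> 8)) >> 8. */
static inline uint8_t
div_255 (uint32_t x)
{
    x += 0x80;
    return (uint8_t)((x + (x >> 8)) >> 8);
}

/* Per-channel scalar model of in_over(): (s IN m) OVER d, with
 * premultiplied alpha (so s <= sa and the sum cannot overflow);
 * sa is the source alpha channel. */
static inline uint8_t
in_over_channel (uint8_t s, uint8_t sa, uint8_t m, uint8_t d)
{
    uint8_t s_in_m  = div_255 ((uint32_t)s  * m);
    uint8_t sa_in_m = div_255 ((uint32_t)sa * m);

    return s_in_m + div_255 ((uint32_t)d * (255 - sa_in_m));
}
```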
while (twidth && ((uintptr_t)q & 7)) 1.2746 + { 1.2747 + uint32_t m = *(uint32_t *)p; 1.2748 + 1.2749 + if (m) 1.2750 + { 1.2751 + uint64_t d = *q; 1.2752 + __m64 vdest = expand565 (to_m64 (d), 0); 1.2753 + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); 1.2754 + *q = to_uint64 (vdest); 1.2755 + } 1.2756 + 1.2757 + twidth--; 1.2758 + p++; 1.2759 + q++; 1.2760 + } 1.2761 + 1.2762 + while (twidth >= 4) 1.2763 + { 1.2764 + uint32_t m0, m1, m2, m3; 1.2765 + 1.2766 + m0 = *p; 1.2767 + m1 = *(p + 1); 1.2768 + m2 = *(p + 2); 1.2769 + m3 = *(p + 3); 1.2770 + 1.2771 + if ((m0 | m1 | m2 | m3)) 1.2772 + { 1.2773 + __m64 vdest = *(__m64 *)q; 1.2774 + __m64 v0, v1, v2, v3; 1.2775 + 1.2776 + expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); 1.2777 + 1.2778 + v0 = in_over (vsrc, vsrca, load8888 (&m0), v0); 1.2779 + v1 = in_over (vsrc, vsrca, load8888 (&m1), v1); 1.2780 + v2 = in_over (vsrc, vsrca, load8888 (&m2), v2); 1.2781 + v3 = in_over (vsrc, vsrca, load8888 (&m3), v3); 1.2782 + 1.2783 + *(__m64 *)q = pack_4x565 (v0, v1, v2, v3); 1.2784 + } 1.2785 + twidth -= 4; 1.2786 + p += 4; 1.2787 + q += 4; 1.2788 + } 1.2789 + 1.2790 + while (twidth) 1.2791 + { 1.2792 + uint32_t m; 1.2793 + 1.2794 + m = *(uint32_t *)p; 1.2795 + if (m) 1.2796 + { 1.2797 + uint64_t d = *q; 1.2798 + __m64 vdest = expand565 (to_m64 (d), 0); 1.2799 + vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); 1.2800 + *q = to_uint64 (vdest); 1.2801 + } 1.2802 + 1.2803 + twidth--; 1.2804 + p++; 1.2805 + q++; 1.2806 + } 1.2807 + 1.2808 + mask_line += mask_stride; 1.2809 + dst_line += dst_stride; 1.2810 + } 1.2811 + 1.2812 + _mm_empty (); 1.2813 +} 1.2814 + 1.2815 +static void 1.2816 +mmx_composite_in_n_8_8 (pixman_implementation_t *imp, 1.2817 + pixman_composite_info_t *info) 1.2818 +{ 1.2819 + PIXMAN_COMPOSITE_ARGS (info); 1.2820 + uint8_t *dst_line, *dst; 1.2821 + uint8_t *mask_line, *mask; 1.2822 + int dst_stride, mask_stride; 1.2823 + int32_t w; 1.2824 + uint32_t src; 1.2825 + uint8_t sa; 1.2826 + __m64 vsrc, vsrca; 1.2827 + 1.2828 + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 1.2829 + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 1.2830 + 1.2831 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.2832 + 1.2833 + sa = src >> 24; 1.2834 + 1.2835 + vsrc = load8888 (&src); 1.2836 + vsrca = expand_alpha (vsrc); 1.2837 + 1.2838 + while (height--) 1.2839 + { 1.2840 + dst = dst_line; 1.2841 + dst_line += dst_stride; 1.2842 + mask = mask_line; 1.2843 + mask_line += mask_stride; 1.2844 + w = width; 1.2845 + 1.2846 + while (w && (uintptr_t)dst & 7) 1.2847 + { 1.2848 + uint16_t tmp; 1.2849 + uint8_t a; 1.2850 + uint32_t m, d; 1.2851 + 1.2852 + a = *mask++; 1.2853 + d = *dst; 1.2854 + 1.2855 + m = MUL_UN8 (sa, a, tmp); 1.2856 + d = MUL_UN8 (m, d, tmp); 1.2857 + 1.2858 + *dst++ = d; 1.2859 + w--; 1.2860 + } 1.2861 + 1.2862 + while (w >= 4) 1.2863 + { 1.2864 + __m64 vmask; 1.2865 + __m64 vdest; 1.2866 + 1.2867 + vmask = load8888u ((uint32_t *)mask); 1.2868 + vdest = load8888 ((uint32_t *)dst); 1.2869 + 1.2870 + store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest)); 1.2871 + 1.2872 + dst += 4; 1.2873 + mask += 4; 1.2874 + w -= 4; 1.2875 + } 1.2876 + 1.2877 + while (w--) 1.2878 + { 1.2879 + uint16_t tmp; 1.2880 + uint8_t a; 1.2881 + uint32_t m, d; 1.2882 + 1.2883 + a = *mask++; 1.2884 + d = *dst; 1.2885 + 1.2886 + m = MUL_UN8 (sa, a, tmp); 1.2887 + d = MUL_UN8 (m, d, tmp); 1.2888 + 1.2889 + *dst++ = 
d; 1.2890 + } 1.2891 + } 1.2892 + 1.2893 + _mm_empty (); 1.2894 +} 1.2895 + 1.2896 +static void 1.2897 +mmx_composite_in_8_8 (pixman_implementation_t *imp, 1.2898 + pixman_composite_info_t *info) 1.2899 +{ 1.2900 + PIXMAN_COMPOSITE_ARGS (info); 1.2901 + uint8_t *dst_line, *dst; 1.2902 + uint8_t *src_line, *src; 1.2903 + int src_stride, dst_stride; 1.2904 + int32_t w; 1.2905 + 1.2906 + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 1.2907 + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); 1.2908 + 1.2909 + while (height--) 1.2910 + { 1.2911 + dst = dst_line; 1.2912 + dst_line += dst_stride; 1.2913 + src = src_line; 1.2914 + src_line += src_stride; 1.2915 + w = width; 1.2916 + 1.2917 + while (w && (uintptr_t)dst & 3) 1.2918 + { 1.2919 + uint8_t s, d; 1.2920 + uint16_t tmp; 1.2921 + 1.2922 + s = *src; 1.2923 + d = *dst; 1.2924 + 1.2925 + *dst = MUL_UN8 (s, d, tmp); 1.2926 + 1.2927 + src++; 1.2928 + dst++; 1.2929 + w--; 1.2930 + } 1.2931 + 1.2932 + while (w >= 4) 1.2933 + { 1.2934 + uint32_t *s = (uint32_t *)src; 1.2935 + uint32_t *d = (uint32_t *)dst; 1.2936 + 1.2937 + store8888 (d, in (load8888u (s), load8888 (d))); 1.2938 + 1.2939 + w -= 4; 1.2940 + dst += 4; 1.2941 + src += 4; 1.2942 + } 1.2943 + 1.2944 + while (w--) 1.2945 + { 1.2946 + uint8_t s, d; 1.2947 + uint16_t tmp; 1.2948 + 1.2949 + s = *src; 1.2950 + d = *dst; 1.2951 + 1.2952 + *dst = MUL_UN8 (s, d, tmp); 1.2953 + 1.2954 + src++; 1.2955 + dst++; 1.2956 + } 1.2957 + } 1.2958 + 1.2959 + _mm_empty (); 1.2960 +} 1.2961 + 1.2962 +static void 1.2963 +mmx_composite_add_n_8_8 (pixman_implementation_t *imp, 1.2964 + pixman_composite_info_t *info) 1.2965 +{ 1.2966 + PIXMAN_COMPOSITE_ARGS (info); 1.2967 + uint8_t *dst_line, *dst; 1.2968 + uint8_t *mask_line, *mask; 1.2969 + int dst_stride, mask_stride; 1.2970 + int32_t w; 1.2971 + uint32_t src; 1.2972 + uint8_t sa; 1.2973 + __m64 vsrc, vsrca; 1.2974 + 1.2975 + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 1.2976 + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 1.2977 + 1.2978 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.2979 + 1.2980 + sa = src >> 24; 1.2981 + 1.2982 + if (src == 0) 1.2983 + return; 1.2984 + 1.2985 + vsrc = load8888 (&src); 1.2986 + vsrca = expand_alpha (vsrc); 1.2987 + 1.2988 + while (height--) 1.2989 + { 1.2990 + dst = dst_line; 1.2991 + dst_line += dst_stride; 1.2992 + mask = mask_line; 1.2993 + mask_line += mask_stride; 1.2994 + w = width; 1.2995 + 1.2996 + while (w && (uintptr_t)dst & 3) 1.2997 + { 1.2998 + uint16_t tmp; 1.2999 + uint16_t a; 1.3000 + uint32_t m, d; 1.3001 + uint32_t r; 1.3002 + 1.3003 + a = *mask++; 1.3004 + d = *dst; 1.3005 + 1.3006 + m = MUL_UN8 (sa, a, tmp); 1.3007 + r = ADD_UN8 (m, d, tmp); 1.3008 + 1.3009 + *dst++ = r; 1.3010 + w--; 1.3011 + } 1.3012 + 1.3013 + while (w >= 4) 1.3014 + { 1.3015 + __m64 vmask; 1.3016 + __m64 vdest; 1.3017 + 1.3018 + vmask = load8888u ((uint32_t *)mask); 1.3019 + vdest = load8888 ((uint32_t *)dst); 1.3020 + 1.3021 + store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest)); 1.3022 + 1.3023 + dst += 4; 1.3024 + mask += 4; 1.3025 + w -= 4; 1.3026 + } 1.3027 + 1.3028 + while (w--) 1.3029 + { 1.3030 + uint16_t tmp; 1.3031 + uint16_t a; 1.3032 + uint32_t m, d; 1.3033 + uint32_t r; 1.3034 + 1.3035 + a = *mask++; 1.3036 + d = *dst; 1.3037 + 1.3038 + m = MUL_UN8 (sa, a, tmp); 1.3039 + r = ADD_UN8 (m, d, tmp); 1.3040 + 1.3041 + 
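Both ADD paths clamp at 255: the vector loops use _mm_adds_pu8, while the scalar tails of mmx_composite_add_8_8 just below use the branch-free clamp t | (0 - (t >> 8)). A self-contained sketch of that identity:

```c
#include <stdint.h>

/* Scalar counterpart of _mm_adds_pu8 for one byte, using the same
 * branch-free clamp as mmx_composite_add_8_8: if the sum overflows
 * 8 bits, (t >> 8) is 1, so 0 - (t >> 8) is all-ones and the OR
 * saturates the result to 0xff. */
static inline uint8_t
add_sat_u8 (uint8_t a, uint8_t b)
{
    unsigned t = (unsigned)a + b;
    return (uint8_t)(t | (0u - (t >> 8)));
}
```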
*dst++ = r; 1.3042 + } 1.3043 + } 1.3044 + 1.3045 + _mm_empty (); 1.3046 +} 1.3047 + 1.3048 +static void 1.3049 +mmx_composite_add_8_8 (pixman_implementation_t *imp, 1.3050 + pixman_composite_info_t *info) 1.3051 +{ 1.3052 + PIXMAN_COMPOSITE_ARGS (info); 1.3053 + uint8_t *dst_line, *dst; 1.3054 + uint8_t *src_line, *src; 1.3055 + int dst_stride, src_stride; 1.3056 + int32_t w; 1.3057 + uint8_t s, d; 1.3058 + uint16_t t; 1.3059 + 1.3060 + CHECKPOINT (); 1.3061 + 1.3062 + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); 1.3063 + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); 1.3064 + 1.3065 + while (height--) 1.3066 + { 1.3067 + dst = dst_line; 1.3068 + dst_line += dst_stride; 1.3069 + src = src_line; 1.3070 + src_line += src_stride; 1.3071 + w = width; 1.3072 + 1.3073 + while (w && (uintptr_t)dst & 7) 1.3074 + { 1.3075 + s = *src; 1.3076 + d = *dst; 1.3077 + t = d + s; 1.3078 + s = t | (0 - (t >> 8)); 1.3079 + *dst = s; 1.3080 + 1.3081 + dst++; 1.3082 + src++; 1.3083 + w--; 1.3084 + } 1.3085 + 1.3086 + while (w >= 8) 1.3087 + { 1.3088 + *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst); 1.3089 + dst += 8; 1.3090 + src += 8; 1.3091 + w -= 8; 1.3092 + } 1.3093 + 1.3094 + while (w) 1.3095 + { 1.3096 + s = *src; 1.3097 + d = *dst; 1.3098 + t = d + s; 1.3099 + s = t | (0 - (t >> 8)); 1.3100 + *dst = s; 1.3101 + 1.3102 + dst++; 1.3103 + src++; 1.3104 + w--; 1.3105 + } 1.3106 + } 1.3107 + 1.3108 + _mm_empty (); 1.3109 +} 1.3110 + 1.3111 +static void 1.3112 +mmx_composite_add_0565_0565 (pixman_implementation_t *imp, 1.3113 + pixman_composite_info_t *info) 1.3114 +{ 1.3115 + PIXMAN_COMPOSITE_ARGS (info); 1.3116 + uint16_t *dst_line, *dst; 1.3117 + uint32_t d; 1.3118 + uint16_t *src_line, *src; 1.3119 + uint32_t s; 1.3120 + int dst_stride, src_stride; 1.3121 + int32_t w; 1.3122 + 1.3123 + CHECKPOINT (); 1.3124 + 1.3125 + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1); 1.3126 + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); 1.3127 + 1.3128 + while (height--) 1.3129 + { 1.3130 + dst = dst_line; 1.3131 + dst_line += dst_stride; 1.3132 + src = src_line; 1.3133 + src_line += src_stride; 1.3134 + w = width; 1.3135 + 1.3136 + while (w && (uintptr_t)dst & 7) 1.3137 + { 1.3138 + s = *src++; 1.3139 + if (s) 1.3140 + { 1.3141 + d = *dst; 1.3142 + s = convert_0565_to_8888 (s); 1.3143 + if (d) 1.3144 + { 1.3145 + d = convert_0565_to_8888 (d); 1.3146 + UN8x4_ADD_UN8x4 (s, d); 1.3147 + } 1.3148 + *dst = convert_8888_to_0565 (s); 1.3149 + } 1.3150 + dst++; 1.3151 + w--; 1.3152 + } 1.3153 + 1.3154 + while (w >= 4) 1.3155 + { 1.3156 + __m64 vdest = *(__m64 *)dst; 1.3157 + __m64 vsrc = ldq_u ((__m64 *)src); 1.3158 + __m64 vd0, vd1; 1.3159 + __m64 vs0, vs1; 1.3160 + 1.3161 + expand_4xpacked565 (vdest, &vd0, &vd1, 0); 1.3162 + expand_4xpacked565 (vsrc, &vs0, &vs1, 0); 1.3163 + 1.3164 + vd0 = _mm_adds_pu8 (vd0, vs0); 1.3165 + vd1 = _mm_adds_pu8 (vd1, vs1); 1.3166 + 1.3167 + *(__m64 *)dst = pack_4xpacked565 (vd0, vd1); 1.3168 + 1.3169 + dst += 4; 1.3170 + src += 4; 1.3171 + w -= 4; 1.3172 + } 1.3173 + 1.3174 + while (w--) 1.3175 + { 1.3176 + s = *src++; 1.3177 + if (s) 1.3178 + { 1.3179 + d = *dst; 1.3180 + s = convert_0565_to_8888 (s); 1.3181 + if (d) 1.3182 + { 1.3183 + d = convert_0565_to_8888 (d); 1.3184 + UN8x4_ADD_UN8x4 (s, d); 1.3185 + } 1.3186 + *dst = convert_8888_to_0565 (s); 1.3187 + } 1.3188 + dst++; 1.3189 + } 1.3190 + } 1.3191 + 1.3192 + _mm_empty 
(); 1.3193 +} 1.3194 + 1.3195 +static void 1.3196 +mmx_composite_add_8888_8888 (pixman_implementation_t *imp, 1.3197 + pixman_composite_info_t *info) 1.3198 +{ 1.3199 + PIXMAN_COMPOSITE_ARGS (info); 1.3200 + uint32_t *dst_line, *dst; 1.3201 + uint32_t *src_line, *src; 1.3202 + int dst_stride, src_stride; 1.3203 + int32_t w; 1.3204 + 1.3205 + CHECKPOINT (); 1.3206 + 1.3207 + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 1.3208 + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.3209 + 1.3210 + while (height--) 1.3211 + { 1.3212 + dst = dst_line; 1.3213 + dst_line += dst_stride; 1.3214 + src = src_line; 1.3215 + src_line += src_stride; 1.3216 + w = width; 1.3217 + 1.3218 + while (w && (uintptr_t)dst & 7) 1.3219 + { 1.3220 + store (dst, _mm_adds_pu8 (load ((const uint32_t *)src), 1.3221 + load ((const uint32_t *)dst))); 1.3222 + dst++; 1.3223 + src++; 1.3224 + w--; 1.3225 + } 1.3226 + 1.3227 + while (w >= 2) 1.3228 + { 1.3229 + *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst); 1.3230 + dst += 2; 1.3231 + src += 2; 1.3232 + w -= 2; 1.3233 + } 1.3234 + 1.3235 + if (w) 1.3236 + { 1.3237 + store (dst, _mm_adds_pu8 (load ((const uint32_t *)src), 1.3238 + load ((const uint32_t *)dst))); 1.3239 + 1.3240 + } 1.3241 + } 1.3242 + 1.3243 + _mm_empty (); 1.3244 +} 1.3245 + 1.3246 +static pixman_bool_t 1.3247 +mmx_blt (pixman_implementation_t *imp, 1.3248 + uint32_t * src_bits, 1.3249 + uint32_t * dst_bits, 1.3250 + int src_stride, 1.3251 + int dst_stride, 1.3252 + int src_bpp, 1.3253 + int dst_bpp, 1.3254 + int src_x, 1.3255 + int src_y, 1.3256 + int dest_x, 1.3257 + int dest_y, 1.3258 + int width, 1.3259 + int height) 1.3260 +{ 1.3261 + uint8_t * src_bytes; 1.3262 + uint8_t * dst_bytes; 1.3263 + int byte_width; 1.3264 + 1.3265 + if (src_bpp != dst_bpp) 1.3266 + return FALSE; 1.3267 + 1.3268 + if (src_bpp == 16) 1.3269 + { 1.3270 + src_stride = src_stride * (int) sizeof (uint32_t) / 2; 1.3271 + dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; 1.3272 + src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); 1.3273 + dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); 1.3274 + byte_width = 2 * width; 1.3275 + src_stride *= 2; 1.3276 + dst_stride *= 2; 1.3277 + } 1.3278 + else if (src_bpp == 32) 1.3279 + { 1.3280 + src_stride = src_stride * (int) sizeof (uint32_t) / 4; 1.3281 + dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; 1.3282 + src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); 1.3283 + dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); 1.3284 + byte_width = 4 * width; 1.3285 + src_stride *= 4; 1.3286 + dst_stride *= 4; 1.3287 + } 1.3288 + else 1.3289 + { 1.3290 + return FALSE; 1.3291 + } 1.3292 + 1.3293 + while (height--) 1.3294 + { 1.3295 + int w; 1.3296 + uint8_t *s = src_bytes; 1.3297 + uint8_t *d = dst_bytes; 1.3298 + src_bytes += src_stride; 1.3299 + dst_bytes += dst_stride; 1.3300 + w = byte_width; 1.3301 + 1.3302 + if (w >= 1 && ((uintptr_t)d & 1)) 1.3303 + { 1.3304 + *(uint8_t *)d = *(uint8_t *)s; 1.3305 + w -= 1; 1.3306 + s += 1; 1.3307 + d += 1; 1.3308 + } 1.3309 + 1.3310 + if (w >= 2 && ((uintptr_t)d & 3)) 1.3311 + { 1.3312 + *(uint16_t *)d = *(uint16_t *)s; 1.3313 + w -= 2; 1.3314 + s += 2; 1.3315 + d += 2; 1.3316 + } 1.3317 + 1.3318 + while (w >= 4 && ((uintptr_t)d & 7)) 1.3319 + { 1.3320 + *(uint32_t *)d = ldl_u ((uint32_t *)s); 1.3321 + 1.3322 + w -= 4; 1.3323 
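Stripped of its alignment ladder and the 64-byte MMX inner loop, mmx_blt is a byte-exact rectangle copy. A portable sketch of the outer structure, with strides already converted to bytes as in the code above (hypothetical helper, not part of pixman):

```c
#include <stdint.h>
#include <string.h>

/* Copy byte_width bytes per row, stepping source and destination by
 * their byte strides. The MMX path replaces the memcpy with an
 * unrolled 64-byte movq loop plus head/tail alignment stores. */
static void
blt_rows (uint8_t *dst, const uint8_t *src,
          int dst_stride, int src_stride,
          int byte_width, int height)
{
    while (height--)
    {
        memcpy (dst, src, (size_t)byte_width);
        dst += dst_stride;
        src += src_stride;
    }
}
```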
+ s += 4; 1.3324 + d += 4; 1.3325 + } 1.3326 + 1.3327 + while (w >= 64) 1.3328 + { 1.3329 +#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX 1.3330 + __asm__ ( 1.3331 + "movq (%1), %%mm0\n" 1.3332 + "movq 8(%1), %%mm1\n" 1.3333 + "movq 16(%1), %%mm2\n" 1.3334 + "movq 24(%1), %%mm3\n" 1.3335 + "movq 32(%1), %%mm4\n" 1.3336 + "movq 40(%1), %%mm5\n" 1.3337 + "movq 48(%1), %%mm6\n" 1.3338 + "movq 56(%1), %%mm7\n" 1.3339 + 1.3340 + "movq %%mm0, (%0)\n" 1.3341 + "movq %%mm1, 8(%0)\n" 1.3342 + "movq %%mm2, 16(%0)\n" 1.3343 + "movq %%mm3, 24(%0)\n" 1.3344 + "movq %%mm4, 32(%0)\n" 1.3345 + "movq %%mm5, 40(%0)\n" 1.3346 + "movq %%mm6, 48(%0)\n" 1.3347 + "movq %%mm7, 56(%0)\n" 1.3348 + : 1.3349 + : "r" (d), "r" (s) 1.3350 + : "memory", 1.3351 + "%mm0", "%mm1", "%mm2", "%mm3", 1.3352 + "%mm4", "%mm5", "%mm6", "%mm7"); 1.3353 +#else 1.3354 + __m64 v0 = ldq_u ((__m64 *)(s + 0)); 1.3355 + __m64 v1 = ldq_u ((__m64 *)(s + 8)); 1.3356 + __m64 v2 = ldq_u ((__m64 *)(s + 16)); 1.3357 + __m64 v3 = ldq_u ((__m64 *)(s + 24)); 1.3358 + __m64 v4 = ldq_u ((__m64 *)(s + 32)); 1.3359 + __m64 v5 = ldq_u ((__m64 *)(s + 40)); 1.3360 + __m64 v6 = ldq_u ((__m64 *)(s + 48)); 1.3361 + __m64 v7 = ldq_u ((__m64 *)(s + 56)); 1.3362 + *(__m64 *)(d + 0) = v0; 1.3363 + *(__m64 *)(d + 8) = v1; 1.3364 + *(__m64 *)(d + 16) = v2; 1.3365 + *(__m64 *)(d + 24) = v3; 1.3366 + *(__m64 *)(d + 32) = v4; 1.3367 + *(__m64 *)(d + 40) = v5; 1.3368 + *(__m64 *)(d + 48) = v6; 1.3369 + *(__m64 *)(d + 56) = v7; 1.3370 +#endif 1.3371 + 1.3372 + w -= 64; 1.3373 + s += 64; 1.3374 + d += 64; 1.3375 + } 1.3376 + while (w >= 4) 1.3377 + { 1.3378 + *(uint32_t *)d = ldl_u ((uint32_t *)s); 1.3379 + 1.3380 + w -= 4; 1.3381 + s += 4; 1.3382 + d += 4; 1.3383 + } 1.3384 + if (w >= 2) 1.3385 + { 1.3386 + *(uint16_t *)d = *(uint16_t *)s; 1.3387 + w -= 2; 1.3388 + s += 2; 1.3389 + d += 2; 1.3390 + } 1.3391 + } 1.3392 + 1.3393 + _mm_empty (); 1.3394 + 1.3395 + return TRUE; 1.3396 +} 1.3397 + 1.3398 +static void 1.3399 +mmx_composite_copy_area (pixman_implementation_t *imp, 1.3400 + pixman_composite_info_t *info) 1.3401 +{ 1.3402 + PIXMAN_COMPOSITE_ARGS (info); 1.3403 + 1.3404 + mmx_blt (imp, src_image->bits.bits, 1.3405 + dest_image->bits.bits, 1.3406 + src_image->bits.rowstride, 1.3407 + dest_image->bits.rowstride, 1.3408 + PIXMAN_FORMAT_BPP (src_image->bits.format), 1.3409 + PIXMAN_FORMAT_BPP (dest_image->bits.format), 1.3410 + src_x, src_y, dest_x, dest_y, width, height); 1.3411 +} 1.3412 + 1.3413 +static void 1.3414 +mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp, 1.3415 + pixman_composite_info_t *info) 1.3416 +{ 1.3417 + PIXMAN_COMPOSITE_ARGS (info); 1.3418 + uint32_t *src, *src_line; 1.3419 + uint32_t *dst, *dst_line; 1.3420 + uint8_t *mask, *mask_line; 1.3421 + int src_stride, mask_stride, dst_stride; 1.3422 + int32_t w; 1.3423 + 1.3424 + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.3425 + PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); 1.3426 + PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); 1.3427 + 1.3428 + while (height--) 1.3429 + { 1.3430 + src = src_line; 1.3431 + src_line += src_stride; 1.3432 + dst = dst_line; 1.3433 + dst_line += dst_stride; 1.3434 + mask = mask_line; 1.3435 + mask_line += mask_stride; 1.3436 + 1.3437 + w = width; 1.3438 + 1.3439 + while (w--) 1.3440 + { 1.3441 + uint64_t m = *mask; 1.3442 + 1.3443 + if (m) 1.3444 + { 1.3445 + uint32_t ssrc = *src | 
0xff000000; 1.3446 + __m64 s = load8888 (&ssrc); 1.3447 + 1.3448 + if (m == 0xff) 1.3449 + { 1.3450 + store8888 (dst, s); 1.3451 + } 1.3452 + else 1.3453 + { 1.3454 + __m64 sa = expand_alpha (s); 1.3455 + __m64 vm = expand_alpha_rev (to_m64 (m)); 1.3456 + __m64 vdest = in_over (s, sa, vm, load8888 (dst)); 1.3457 + 1.3458 + store8888 (dst, vdest); 1.3459 + } 1.3460 + } 1.3461 + 1.3462 + mask++; 1.3463 + dst++; 1.3464 + src++; 1.3465 + } 1.3466 + } 1.3467 + 1.3468 + _mm_empty (); 1.3469 +} 1.3470 + 1.3471 +static void 1.3472 +mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp, 1.3473 + pixman_composite_info_t *info) 1.3474 +{ 1.3475 + PIXMAN_COMPOSITE_ARGS (info); 1.3476 + uint32_t src; 1.3477 + uint32_t *dst_line, *dst; 1.3478 + int32_t w; 1.3479 + int dst_stride; 1.3480 + __m64 vsrc; 1.3481 + 1.3482 + CHECKPOINT (); 1.3483 + 1.3484 + src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); 1.3485 + 1.3486 + if (src == 0) 1.3487 + return; 1.3488 + 1.3489 + PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); 1.3490 + 1.3491 + vsrc = load8888 (&src); 1.3492 + 1.3493 + while (height--) 1.3494 + { 1.3495 + dst = dst_line; 1.3496 + dst_line += dst_stride; 1.3497 + w = width; 1.3498 + 1.3499 + CHECKPOINT (); 1.3500 + 1.3501 + while (w && (uintptr_t)dst & 7) 1.3502 + { 1.3503 + __m64 vdest = load8888 (dst); 1.3504 + 1.3505 + store8888 (dst, over (vdest, expand_alpha (vdest), vsrc)); 1.3506 + 1.3507 + w--; 1.3508 + dst++; 1.3509 + } 1.3510 + 1.3511 + while (w >= 2) 1.3512 + { 1.3513 + __m64 vdest = *(__m64 *)dst; 1.3514 + __m64 dest0 = expand8888 (vdest, 0); 1.3515 + __m64 dest1 = expand8888 (vdest, 1); 1.3516 + 1.3517 + 1.3518 + dest0 = over (dest0, expand_alpha (dest0), vsrc); 1.3519 + dest1 = over (dest1, expand_alpha (dest1), vsrc); 1.3520 + 1.3521 + *(__m64 *)dst = pack8888 (dest0, dest1); 1.3522 + 1.3523 + dst += 2; 1.3524 + w -= 2; 1.3525 + } 1.3526 + 1.3527 + CHECKPOINT (); 1.3528 + 1.3529 + if (w) 1.3530 + { 1.3531 + __m64 vdest = load8888 (dst); 1.3532 + 1.3533 + store8888 (dst, over (vdest, expand_alpha (vdest), vsrc)); 1.3534 + } 1.3535 + } 1.3536 + 1.3537 + _mm_empty (); 1.3538 +} 1.3539 + 1.3540 +#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS)) 1.3541 +#define BMSK (BSHIFT - 1) 1.3542 + 1.3543 +#define BILINEAR_DECLARE_VARIABLES \ 1.3544 + const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt); \ 1.3545 + const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb); \ 1.3546 + const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT); \ 1.3547 + const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1); \ 1.3548 + const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK); \ 1.3549 + const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x); \ 1.3550 + const __m64 mm_zero = _mm_setzero_si64 (); \ 1.3551 + __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx) 1.3552 + 1.3553 +#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \ 1.3554 +do { \ 1.3555 + /* fetch 2x2 pixel block into 2 mmx registers */ \ 1.3556 + __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]); \ 1.3557 + __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]); \ 1.3558 + /* vertical interpolation */ \ 1.3559 + __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \ 1.3560 + __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \ 1.3561 + __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb); \ 1.3562 + __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \ 1.3563 + __m64 hi = _mm_add_pi16 (t_hi, b_hi); 
\ 1.3564 + __m64 lo = _mm_add_pi16 (t_lo, b_lo); \ 1.3565 + vx += unit_x; \ 1.3566 + if (BILINEAR_INTERPOLATION_BITS < 8) \ 1.3567 + { \ 1.3568 + /* calculate horizontal weights */ \ 1.3569 + __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7, \ 1.3570 + _mm_srli_pi16 (mm_x, \ 1.3571 + 16 - BILINEAR_INTERPOLATION_BITS))); \ 1.3572 + /* horizontal interpolation */ \ 1.3573 + __m64 p = _mm_unpacklo_pi16 (lo, hi); \ 1.3574 + __m64 q = _mm_unpackhi_pi16 (lo, hi); \ 1.3575 + lo = _mm_madd_pi16 (p, mm_wh); \ 1.3576 + hi = _mm_madd_pi16 (q, mm_wh); \ 1.3577 + } \ 1.3578 + else \ 1.3579 + { \ 1.3580 + /* calculate horizontal weights */ \ 1.3581 + __m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x, \ 1.3582 + 16 - BILINEAR_INTERPOLATION_BITS)); \ 1.3583 + __m64 mm_wh_hi = _mm_srli_pi16 (mm_x, \ 1.3584 + 16 - BILINEAR_INTERPOLATION_BITS); \ 1.3585 + /* horizontal interpolation */ \ 1.3586 + __m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo); \ 1.3587 + __m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi); \ 1.3588 + __m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo); \ 1.3589 + __m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi); \ 1.3590 + lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo), \ 1.3591 + _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi)); \ 1.3592 + hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo), \ 1.3593 + _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi)); \ 1.3594 + } \ 1.3595 + mm_x = _mm_add_pi16 (mm_x, mm_ux); \ 1.3596 + /* shift and pack the result */ \ 1.3597 + hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2); \ 1.3598 + lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2); \ 1.3599 + lo = _mm_packs_pi32 (lo, hi); \ 1.3600 + lo = _mm_packs_pu16 (lo, lo); \ 1.3601 + pix = lo; \ 1.3602 +} while (0) 1.3603 + 1.3604 +#define BILINEAR_SKIP_ONE_PIXEL() \ 1.3605 +do { \ 1.3606 + vx += unit_x; \ 1.3607 + mm_x = _mm_add_pi16 (mm_x, mm_ux); \ 1.3608 +} while(0) 1.3609 + 1.3610 +static force_inline void 1.3611 +scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t * dst, 1.3612 + const uint32_t * mask, 1.3613 + const uint32_t * src_top, 1.3614 + const uint32_t * src_bottom, 1.3615 + int32_t w, 1.3616 + int wt, 1.3617 + int wb, 1.3618 + pixman_fixed_t vx, 1.3619 + pixman_fixed_t unit_x, 1.3620 + pixman_fixed_t max_vx, 1.3621 + pixman_bool_t zero_src) 1.3622 +{ 1.3623 + BILINEAR_DECLARE_VARIABLES; 1.3624 + __m64 pix; 1.3625 + 1.3626 + while (w--) 1.3627 + { 1.3628 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix); 1.3629 + store (dst, pix); 1.3630 + dst++; 1.3631 + } 1.3632 + 1.3633 + _mm_empty (); 1.3634 +} 1.3635 + 1.3636 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC, 1.3637 + scaled_bilinear_scanline_mmx_8888_8888_SRC, 1.3638 + uint32_t, uint32_t, uint32_t, 1.3639 + COVER, FLAG_NONE) 1.3640 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC, 1.3641 + scaled_bilinear_scanline_mmx_8888_8888_SRC, 1.3642 + uint32_t, uint32_t, uint32_t, 1.3643 + PAD, FLAG_NONE) 1.3644 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC, 1.3645 + scaled_bilinear_scanline_mmx_8888_8888_SRC, 1.3646 + uint32_t, uint32_t, uint32_t, 1.3647 + NONE, FLAG_NONE) 1.3648 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC, 1.3649 + scaled_bilinear_scanline_mmx_8888_8888_SRC, 1.3650 + uint32_t, uint32_t, uint32_t, 1.3651 + NORMAL, FLAG_NONE) 1.3652 + 1.3653 +static force_inline void 1.3654 +scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t * dst, 1.3655 + const uint32_t * mask, 1.3656 + const uint32_t * src_top, 1.3657 + const uint32_t * src_bottom, 1.3658 + int32_t w, 1.3659 + int wt, 1.3660 + int 
wb, 1.3661 + pixman_fixed_t vx, 1.3662 + pixman_fixed_t unit_x, 1.3663 + pixman_fixed_t max_vx, 1.3664 + pixman_bool_t zero_src) 1.3665 +{ 1.3666 + BILINEAR_DECLARE_VARIABLES; 1.3667 + __m64 pix1, pix2; 1.3668 + 1.3669 + while (w) 1.3670 + { 1.3671 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 1.3672 + 1.3673 + if (!is_zero (pix1)) 1.3674 + { 1.3675 + pix2 = load (dst); 1.3676 + store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2)); 1.3677 + } 1.3678 + 1.3679 + w--; 1.3680 + dst++; 1.3681 + } 1.3682 + 1.3683 + _mm_empty (); 1.3684 +} 1.3685 + 1.3686 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER, 1.3687 + scaled_bilinear_scanline_mmx_8888_8888_OVER, 1.3688 + uint32_t, uint32_t, uint32_t, 1.3689 + COVER, FLAG_NONE) 1.3690 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER, 1.3691 + scaled_bilinear_scanline_mmx_8888_8888_OVER, 1.3692 + uint32_t, uint32_t, uint32_t, 1.3693 + PAD, FLAG_NONE) 1.3694 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER, 1.3695 + scaled_bilinear_scanline_mmx_8888_8888_OVER, 1.3696 + uint32_t, uint32_t, uint32_t, 1.3697 + NONE, FLAG_NONE) 1.3698 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER, 1.3699 + scaled_bilinear_scanline_mmx_8888_8888_OVER, 1.3700 + uint32_t, uint32_t, uint32_t, 1.3701 + NORMAL, FLAG_NONE) 1.3702 + 1.3703 +static force_inline void 1.3704 +scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t * dst, 1.3705 + const uint8_t * mask, 1.3706 + const uint32_t * src_top, 1.3707 + const uint32_t * src_bottom, 1.3708 + int32_t w, 1.3709 + int wt, 1.3710 + int wb, 1.3711 + pixman_fixed_t vx, 1.3712 + pixman_fixed_t unit_x, 1.3713 + pixman_fixed_t max_vx, 1.3714 + pixman_bool_t zero_src) 1.3715 +{ 1.3716 + BILINEAR_DECLARE_VARIABLES; 1.3717 + __m64 pix1, pix2; 1.3718 + uint32_t m; 1.3719 + 1.3720 + while (w) 1.3721 + { 1.3722 + m = (uint32_t) *mask++; 1.3723 + 1.3724 + if (m) 1.3725 + { 1.3726 + BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); 1.3727 + 1.3728 + if (m == 0xff && is_opaque (pix1)) 1.3729 + { 1.3730 + store (dst, pix1); 1.3731 + } 1.3732 + else 1.3733 + { 1.3734 + __m64 ms, md, ma, msa; 1.3735 + 1.3736 + pix2 = load (dst); 1.3737 + ma = expand_alpha_rev (to_m64 (m)); 1.3738 + ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ()); 1.3739 + md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ()); 1.3740 + 1.3741 + msa = expand_alpha (ms); 1.3742 + 1.3743 + store8888 (dst, (in_over (ms, msa, ma, md))); 1.3744 + } 1.3745 + } 1.3746 + else 1.3747 + { 1.3748 + BILINEAR_SKIP_ONE_PIXEL (); 1.3749 + } 1.3750 + 1.3751 + w--; 1.3752 + dst++; 1.3753 + } 1.3754 + 1.3755 + _mm_empty (); 1.3756 +} 1.3757 + 1.3758 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER, 1.3759 + scaled_bilinear_scanline_mmx_8888_8_8888_OVER, 1.3760 + uint32_t, uint8_t, uint32_t, 1.3761 + COVER, FLAG_HAVE_NON_SOLID_MASK) 1.3762 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER, 1.3763 + scaled_bilinear_scanline_mmx_8888_8_8888_OVER, 1.3764 + uint32_t, uint8_t, uint32_t, 1.3765 + PAD, FLAG_HAVE_NON_SOLID_MASK) 1.3766 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER, 1.3767 + scaled_bilinear_scanline_mmx_8888_8_8888_OVER, 1.3768 + uint32_t, uint8_t, uint32_t, 1.3769 + NONE, FLAG_HAVE_NON_SOLID_MASK) 1.3770 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER, 1.3771 + scaled_bilinear_scanline_mmx_8888_8_8888_OVER, 1.3772 + uint32_t, uint8_t, uint32_t, 1.3773 + NORMAL, FLAG_HAVE_NON_SOLID_MASK) 1.3774 + 1.3775 +static uint32_t * 1.3776 +mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) 1.3777 +{ 1.3778 + int w = iter->width; 
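These iterator fetchers each convert one scanline per call into the 32-bit a8r8g8b8 working format. A scalar model of the x8r8g8b8 case, which only has to force the alpha byte to 0xff; the MMX loop below does the same thing four or eight pixels at a time:

```c
#include <stdint.h>

/* Scalar model of the x8r8g8b8 fetcher: copy the scanline into the
 * iterator buffer while forcing alpha to 0xff, i.e. x8r8g8b8 is
 * widened to a8r8g8b8 by an OR with 0xff000000. */
static void
fetch_x8r8g8b8_scalar (uint32_t *dst, const uint32_t *src, int w)
{
    while (w--)
        *dst++ = *src++ | 0xff000000;
}
```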
1.3779 + uint32_t *dst = iter->buffer; 1.3780 + uint32_t *src = (uint32_t *)iter->bits; 1.3781 + 1.3782 + iter->bits += iter->stride; 1.3783 + 1.3784 + while (w && ((uintptr_t)dst) & 7) 1.3785 + { 1.3786 + *dst++ = (*src++) | 0xff000000; 1.3787 + w--; 1.3788 + } 1.3789 + 1.3790 + while (w >= 8) 1.3791 + { 1.3792 + __m64 vsrc1 = ldq_u ((__m64 *)(src + 0)); 1.3793 + __m64 vsrc2 = ldq_u ((__m64 *)(src + 2)); 1.3794 + __m64 vsrc3 = ldq_u ((__m64 *)(src + 4)); 1.3795 + __m64 vsrc4 = ldq_u ((__m64 *)(src + 6)); 1.3796 + 1.3797 + *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000)); 1.3798 + *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000)); 1.3799 + *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000)); 1.3800 + *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000)); 1.3801 + 1.3802 + dst += 8; 1.3803 + src += 8; 1.3804 + w -= 8; 1.3805 + } 1.3806 + 1.3807 + while (w) 1.3808 + { 1.3809 + *dst++ = (*src++) | 0xff000000; 1.3810 + w--; 1.3811 + } 1.3812 + 1.3813 + _mm_empty (); 1.3814 + return iter->buffer; 1.3815 +} 1.3816 + 1.3817 +static uint32_t * 1.3818 +mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) 1.3819 +{ 1.3820 + int w = iter->width; 1.3821 + uint32_t *dst = iter->buffer; 1.3822 + uint16_t *src = (uint16_t *)iter->bits; 1.3823 + 1.3824 + iter->bits += iter->stride; 1.3825 + 1.3826 + while (w && ((uintptr_t)dst) & 0x0f) 1.3827 + { 1.3828 + uint16_t s = *src++; 1.3829 + 1.3830 + *dst++ = convert_0565_to_8888 (s); 1.3831 + w--; 1.3832 + } 1.3833 + 1.3834 + while (w >= 4) 1.3835 + { 1.3836 + __m64 vsrc = ldq_u ((__m64 *)src); 1.3837 + __m64 mm0, mm1; 1.3838 + 1.3839 + expand_4xpacked565 (vsrc, &mm0, &mm1, 1); 1.3840 + 1.3841 + *(__m64 *)(dst + 0) = mm0; 1.3842 + *(__m64 *)(dst + 2) = mm1; 1.3843 + 1.3844 + dst += 4; 1.3845 + src += 4; 1.3846 + w -= 4; 1.3847 + } 1.3848 + 1.3849 + while (w) 1.3850 + { 1.3851 + uint16_t s = *src++; 1.3852 + 1.3853 + *dst++ = convert_0565_to_8888 (s); 1.3854 + w--; 1.3855 + } 1.3856 + 1.3857 + _mm_empty (); 1.3858 + return iter->buffer; 1.3859 +} 1.3860 + 1.3861 +static uint32_t * 1.3862 +mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) 1.3863 +{ 1.3864 + int w = iter->width; 1.3865 + uint32_t *dst = iter->buffer; 1.3866 + uint8_t *src = iter->bits; 1.3867 + 1.3868 + iter->bits += iter->stride; 1.3869 + 1.3870 + while (w && (((uintptr_t)dst) & 15)) 1.3871 + { 1.3872 + *dst++ = *(src++) << 24; 1.3873 + w--; 1.3874 + } 1.3875 + 1.3876 + while (w >= 8) 1.3877 + { 1.3878 + __m64 mm0 = ldq_u ((__m64 *)src); 1.3879 + 1.3880 + __m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64(), mm0); 1.3881 + __m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64(), mm0); 1.3882 + __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1); 1.3883 + __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1); 1.3884 + __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2); 1.3885 + __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2); 1.3886 + 1.3887 + *(__m64 *)(dst + 0) = mm3; 1.3888 + *(__m64 *)(dst + 2) = mm4; 1.3889 + *(__m64 *)(dst + 4) = mm5; 1.3890 + *(__m64 *)(dst + 6) = mm6; 1.3891 + 1.3892 + dst += 8; 1.3893 + src += 8; 1.3894 + w -= 8; 1.3895 + } 1.3896 + 1.3897 + while (w) 1.3898 + { 1.3899 + *dst++ = *(src++) << 24; 1.3900 + w--; 1.3901 + } 1.3902 + 1.3903 + _mm_empty (); 1.3904 + return iter->buffer; 1.3905 +} 1.3906 + 1.3907 +typedef struct 1.3908 +{ 1.3909 + pixman_format_code_t format; 1.3910 + pixman_iter_get_scanline_t get_scanline; 1.3911 +} fetcher_info_t; 1.3912 + 1.3913 +static const fetcher_info_t fetchers[] = 1.3914 +{ 1.3915 + { 
PIXMAN_x8r8g8b8, mmx_fetch_x8r8g8b8 }, 1.3916 + { PIXMAN_r5g6b5, mmx_fetch_r5g6b5 }, 1.3917 + { PIXMAN_a8, mmx_fetch_a8 }, 1.3918 + { PIXMAN_null } 1.3919 +}; 1.3920 + 1.3921 +static pixman_bool_t 1.3922 +mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) 1.3923 +{ 1.3924 + pixman_image_t *image = iter->image; 1.3925 + 1.3926 +#define FLAGS \ 1.3927 + (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \ 1.3928 + FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) 1.3929 + 1.3930 + if ((iter->iter_flags & ITER_NARROW) && 1.3931 + (iter->image_flags & FLAGS) == FLAGS) 1.3932 + { 1.3933 + const fetcher_info_t *f; 1.3934 + 1.3935 + for (f = &fetchers[0]; f->format != PIXMAN_null; f++) 1.3936 + { 1.3937 + if (image->common.extended_format_code == f->format) 1.3938 + { 1.3939 + uint8_t *b = (uint8_t *)image->bits.bits; 1.3940 + int s = image->bits.rowstride * 4; 1.3941 + 1.3942 + iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8; 1.3943 + iter->stride = s; 1.3944 + 1.3945 + iter->get_scanline = f->get_scanline; 1.3946 + return TRUE; 1.3947 + } 1.3948 + } 1.3949 + } 1.3950 + 1.3951 + return FALSE; 1.3952 +} 1.3953 + 1.3954 +static const pixman_fast_path_t mmx_fast_paths[] = 1.3955 +{ 1.3956 + PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ), 1.3957 + PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ), 1.3958 + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ), 1.3959 + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ), 1.3960 + PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ), 1.3961 + PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ), 1.3962 + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ), 1.3963 + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ), 1.3964 + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ), 1.3965 + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ), 1.3966 + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ), 1.3967 + PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ), 1.3968 + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ), 1.3969 + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ), 1.3970 + PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ), 1.3971 + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ), 1.3972 + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ), 1.3973 + PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ), 1.3974 + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ), 1.3975 + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ), 1.3976 + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ), 1.3977 + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ), 1.3978 + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ), 1.3979 + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, 
mmx_composite_over_8888_n_8888 ), 1.3980 + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ), 1.3981 + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ), 1.3982 + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ), 1.3983 + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ), 1.3984 + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ), 1.3985 + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, mmx_composite_over_x888_8_8888 ), 1.3986 + PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ), 1.3987 + PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ), 1.3988 + PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ), 1.3989 + PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, mmx_composite_over_n_0565 ), 1.3990 + PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), 1.3991 + PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), 1.3992 + 1.3993 + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ), 1.3994 + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ), 1.3995 + PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ), 1.3996 + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ), 1.3997 + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ), 1.3998 + PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ), 1.3999 + 1.4000 + PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888), 1.4001 + PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888), 1.4002 + 1.4003 + PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, mmx_composite_add_0565_0565 ), 1.4004 + PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, mmx_composite_add_0565_0565 ), 1.4005 + PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ), 1.4006 + PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ), 1.4007 + PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ), 1.4008 + PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ), 1.4009 + 1.4010 + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ), 1.4011 + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ), 1.4012 + PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ), 1.4013 + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ), 1.4014 + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ), 1.4015 + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ), 1.4016 + PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ), 1.4017 + PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ), 1.4018 + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ), 1.4019 + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ), 1.4020 + PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), 1.4021 + PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), 1.4022 + 
PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), 1.4023 + PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), 1.4024 + PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ), 1.4025 + PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ), 1.4026 + 1.4027 + PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ), 1.4028 + PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ), 1.4029 + 1.4030 + SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ), 1.4031 + SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ), 1.4032 + SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ), 1.4033 + SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ), 1.4034 + SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ), 1.4035 + SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888 ), 1.4036 + 1.4037 + SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ), 1.4038 + SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ), 1.4039 + SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ), 1.4040 + SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ), 1.4041 + 1.4042 + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888 ), 1.4043 + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888 ), 1.4044 + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888 ), 1.4045 + SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888 ), 1.4046 + 1.4047 + { PIXMAN_OP_NONE }, 1.4048 +}; 1.4049 + 1.4050 +pixman_implementation_t * 1.4051 +_pixman_implementation_create_mmx (pixman_implementation_t *fallback) 1.4052 +{ 1.4053 + pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths); 1.4054 + 1.4055 + imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u; 1.4056 + imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u; 1.4057 + imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u; 1.4058 + imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u; 1.4059 + imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u; 1.4060 + imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u; 1.4061 + imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u; 1.4062 + imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u; 1.4063 + imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u; 1.4064 + imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u; 1.4065 + imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u; 1.4066 + 1.4067 + imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca; 1.4068 + imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca; 1.4069 + imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca; 1.4070 + imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca; 1.4071 + imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca; 1.4072 + imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca; 1.4073 + imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca; 1.4074 + imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca; 1.4075 + imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca; 1.4076 + imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca; 1.4077 + imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca; 1.4078 + 1.4079 + imp->blt = mmx_blt; 1.4080 + imp->fill = 
mmx_fill; 1.4081 + 1.4082 + imp->src_iter_init = mmx_src_iter_init; 1.4083 + 1.4084 + return imp; 1.4085 +} 1.4086 + 1.4087 +#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */
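_pixman_implementation_create_mmx wires the MMX fast-path table, combiners, and the blt/fill/iterator hooks in front of a fallback implementation; anything the MMX layer declines (for example mmx_blt and mmx_fill returning FALSE for unsupported depths) falls through to the next layer. A toy model of that delegation idea, illustrative only and not the pixman API:

```c
#include <stddef.h>

/* Toy model of implementation chaining: each layer handles what it
 * can, else defers to its fallback, the way mmx_blt/mmx_fill return
 * FALSE for cases they do not accelerate. */
typedef struct impl impl_t;
struct impl
{
    impl_t *fallback;
    int (*fill) (impl_t *imp, int bpp);  /* returns 1 if handled */
};

static int
dispatch_fill (impl_t *imp, int bpp)
{
    for (; imp != NULL; imp = imp->fallback)
    {
        if (imp->fill && imp->fill (imp, bpp))
            return 1;   /* an accelerated layer took it */
    }
    return 0;           /* no layer could handle the request */
}
```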