gfx/cairo/libpixman/src/pixman-mmx.c

changeset 6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/cairo/libpixman/src/pixman-mmx.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,4084 @@
     1.4 +/*
     1.5 + * Copyright © 2004, 2005 Red Hat, Inc.
     1.6 + * Copyright © 2004 Nicholas Miell
     1.7 + * Copyright © 2005 Trolltech AS
     1.8 + *
     1.9 + * Permission to use, copy, modify, distribute, and sell this software and its
    1.10 + * documentation for any purpose is hereby granted without fee, provided that
    1.11 + * the above copyright notice appear in all copies and that both that
    1.12 + * copyright notice and this permission notice appear in supporting
    1.13 + * documentation, and that the name of Red Hat not be used in advertising or
    1.14 + * publicity pertaining to distribution of the software without specific,
    1.15 + * written prior permission.  Red Hat makes no representations about the
    1.16 + * suitability of this software for any purpose.  It is provided "as is"
    1.17 + * without express or implied warranty.
    1.18 + *
    1.19 + * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
    1.20 + * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
    1.21 + * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
    1.22 + * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
    1.23 + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
    1.24 + * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
    1.25 + * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
    1.26 + * SOFTWARE.
    1.27 + *
    1.28 + * Author:  Søren Sandmann (sandmann@redhat.com)
    1.29 + * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
    1.30 + * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
    1.31 + *
    1.32 + * Based on work by Owen Taylor
    1.33 + */
    1.34 +
    1.35 +#ifdef HAVE_CONFIG_H
    1.36 +#include <config.h>
    1.37 +#endif
    1.38 +
    1.39 +#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI
    1.40 +
    1.41 +#ifdef USE_LOONGSON_MMI
    1.42 +#include <loongson-mmintrin.h>
    1.43 +#else
    1.44 +#include <mmintrin.h>
    1.45 +#endif
    1.46 +#include "pixman-private.h"
    1.47 +#include "pixman-combine32.h"
    1.48 +#include "pixman-inlines.h"
    1.49 +
     1.50 +#define no_vERBOSE	/* rename to VERBOSE to enable the CHECKPOINT trace below */
    1.51 +
    1.52 +#ifdef VERBOSE
    1.53 +#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
    1.54 +#else
    1.55 +#define CHECKPOINT()
    1.56 +#endif
    1.57 +
    1.58 +#if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
    1.59 +/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this.  */
    1.60 +extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    1.61 +_mm_empty (void)
    1.62 +{
    1.63 +
    1.64 +}
    1.65 +#endif
    1.66 +
    1.67 +#ifdef USE_X86_MMX
    1.68 +# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
    1.69 +#  include <xmmintrin.h>
    1.70 +# else
    1.71 +/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
    1.72 + * instructions to be generated that we don't want. Just duplicate the
    1.73 + * functions we want to use.  */
    1.74 +extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    1.75 +_mm_movemask_pi8 (__m64 __A)
    1.76 +{
    1.77 +    int ret;
    1.78 +
    1.79 +    asm ("pmovmskb %1, %0\n\t"
    1.80 +	: "=r" (ret)
    1.81 +	: "y" (__A)
    1.82 +    );
    1.83 +
    1.84 +    return ret;
    1.85 +}
    1.86 +
    1.87 +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    1.88 +_mm_mulhi_pu16 (__m64 __A, __m64 __B)
    1.89 +{
    1.90 +    asm ("pmulhuw %1, %0\n\t"
    1.91 +	: "+y" (__A)
    1.92 +	: "y" (__B)
    1.93 +    );
    1.94 +    return __A;
    1.95 +}
    1.96 +
    1.97 +#  ifdef __OPTIMIZE__
    1.98 +extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
    1.99 +_mm_shuffle_pi16 (__m64 __A, int8_t const __N)
   1.100 +{
   1.101 +    __m64 ret;
   1.102 +
   1.103 +    asm ("pshufw %2, %1, %0\n\t"
   1.104 +	: "=y" (ret)
   1.105 +	: "y" (__A), "K" (__N)
   1.106 +    );
   1.107 +
   1.108 +    return ret;
   1.109 +}
   1.110 +#  else
   1.111 +#   define _mm_shuffle_pi16(A, N)					\
   1.112 +    ({									\
   1.113 +	__m64 ret;							\
   1.114 +									\
   1.115 +	asm ("pshufw %2, %1, %0\n\t"					\
   1.116 +	     : "=y" (ret)						\
   1.117 +	     : "y" (A), "K" ((const int8_t)N)				\
   1.118 +	);								\
   1.119 +									\
   1.120 +	ret;								\
   1.121 +    })
   1.122 +#  endif
   1.123 +# endif
   1.124 +#endif
   1.125 +
   1.126 +#ifndef _MSC_VER
   1.127 +#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
   1.128 + (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
   1.129 +#endif
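/* _MM_SHUFFLE packs four 2-bit word selectors into the 8-bit pshufw
 * immediate; e.g. _MM_SHUFFLE (3, 3, 3, 3) == 0xff selects word 3 for
 * every result word, which is how expand_alpha () below replicates the
 * alpha channel. */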
   1.130 +
   1.131 +/* Notes about writing mmx code
   1.132 + *
    1.133 + * Give memory operands as the second operand. If you give one as the
    1.134 + * first operand, gcc will first load it into a register and then use
    1.135 + * that register.
    1.136 + *
    1.137 + *   i.e. use
   1.138 + *
   1.139 + *         _mm_mullo_pi16 (x, mmx_constant);
   1.140 + *
   1.141 + *   not
   1.142 + *
   1.143 + *         _mm_mullo_pi16 (mmx_constant, x);
   1.144 + *
    1.145 + * Also try to minimize dependencies: when you need a value, try to
    1.146 + * calculate it from a value that was computed as early as
    1.147 + * possible.
   1.148 + */
   1.149 +
   1.150 +/* --------------- MMX primitives ------------------------------------- */
   1.151 +
   1.152 +/* If __m64 is defined as a struct or union, then define M64_MEMBER to be
   1.153 + * the name of the member used to access the data.
   1.154 + * If __m64 requires using mm_cvt* intrinsics functions to convert between
   1.155 + * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
   1.156 + * If __m64 and uint64_t values can just be cast to each other directly,
   1.157 + * then define USE_M64_CASTS.
   1.158 + * If __m64 is a double datatype, then define USE_M64_DOUBLE.
   1.159 + */
   1.160 +#ifdef _MSC_VER
   1.161 +# define M64_MEMBER m64_u64
   1.162 +#elif defined(__ICC)
   1.163 +# define USE_CVT_INTRINSICS
   1.164 +#elif defined(USE_LOONGSON_MMI)
   1.165 +# define USE_M64_DOUBLE
   1.166 +#elif defined(__GNUC__)
   1.167 +# define USE_M64_CASTS
   1.168 +#elif defined(__SUNPRO_C)
   1.169 +# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
   1.170 +/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
   1.171 + * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
    1.172 + * is defined. If it is in use, then the mm_cvt* intrinsics must be used.
   1.173 + */
   1.174 +#  define USE_CVT_INTRINSICS
   1.175 +# else
   1.176 +/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
   1.177 + * disabled, __m64 is defined as a struct containing "unsigned long long l_".
   1.178 + */
   1.179 +#  define M64_MEMBER l_
   1.180 +# endif
   1.181 +#endif
   1.182 +
   1.183 +#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
   1.184 +typedef uint64_t mmxdatafield;
   1.185 +#else
   1.186 +typedef __m64 mmxdatafield;
   1.187 +#endif
   1.188 +
   1.189 +typedef struct
   1.190 +{
   1.191 +    mmxdatafield mmx_4x00ff;
   1.192 +    mmxdatafield mmx_4x0080;
   1.193 +    mmxdatafield mmx_565_rgb;
   1.194 +    mmxdatafield mmx_565_unpack_multiplier;
   1.195 +    mmxdatafield mmx_565_pack_multiplier;
   1.196 +    mmxdatafield mmx_565_r;
   1.197 +    mmxdatafield mmx_565_g;
   1.198 +    mmxdatafield mmx_565_b;
   1.199 +    mmxdatafield mmx_packed_565_rb;
   1.200 +    mmxdatafield mmx_packed_565_g;
   1.201 +    mmxdatafield mmx_expand_565_g;
   1.202 +    mmxdatafield mmx_expand_565_b;
   1.203 +    mmxdatafield mmx_expand_565_r;
   1.204 +#ifndef USE_LOONGSON_MMI
   1.205 +    mmxdatafield mmx_mask_0;
   1.206 +    mmxdatafield mmx_mask_1;
   1.207 +    mmxdatafield mmx_mask_2;
   1.208 +    mmxdatafield mmx_mask_3;
   1.209 +#endif
   1.210 +    mmxdatafield mmx_full_alpha;
   1.211 +    mmxdatafield mmx_4x0101;
   1.212 +    mmxdatafield mmx_ff000000;
   1.213 +} mmx_data_t;
   1.214 +
   1.215 +#if defined(_MSC_VER)
   1.216 +# define MMXDATA_INIT(field, val) { val ## UI64 }
   1.217 +#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
   1.218 +# define MMXDATA_INIT(field, val) field =   { val ## ULL }
   1.219 +#else                           /* mmxdatafield is an integral type */
   1.220 +# define MMXDATA_INIT(field, val) field =   val ## ULL
   1.221 +#endif
   1.222 +
   1.223 +static const mmx_data_t c =
   1.224 +{
   1.225 +    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
   1.226 +    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
   1.227 +    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
   1.228 +    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
   1.229 +    MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
   1.230 +    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
   1.231 +    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
   1.232 +    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
   1.233 +    MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
   1.234 +    MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
   1.235 +    MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0),
   1.236 +    MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f),
   1.237 +    MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800),
   1.238 +#ifndef USE_LOONGSON_MMI
   1.239 +    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
   1.240 +    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
   1.241 +    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
   1.242 +    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
   1.243 +#endif
   1.244 +    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
   1.245 +    MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
   1.246 +    MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000),
   1.247 +};
   1.248 +
   1.249 +#ifdef USE_CVT_INTRINSICS
   1.250 +#    define MC(x) to_m64 (c.mmx_ ## x)
   1.251 +#elif defined(USE_M64_CASTS)
   1.252 +#    define MC(x) ((__m64)c.mmx_ ## x)
   1.253 +#elif defined(USE_M64_DOUBLE)
   1.254 +#    define MC(x) (*(__m64 *)&c.mmx_ ## x)
   1.255 +#else
   1.256 +#    define MC(x) c.mmx_ ## x
   1.257 +#endif
   1.258 +
   1.259 +static force_inline __m64
   1.260 +to_m64 (uint64_t x)
   1.261 +{
   1.262 +#ifdef USE_CVT_INTRINSICS
   1.263 +    return _mm_cvtsi64_m64 (x);
   1.264 +#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
   1.265 +    __m64 res;
   1.266 +
   1.267 +    res.M64_MEMBER = x;
   1.268 +    return res;
   1.269 +#elif defined USE_M64_DOUBLE
   1.270 +    return *(__m64 *)&x;
   1.271 +#else /* USE_M64_CASTS */
   1.272 +    return (__m64)x;
   1.273 +#endif
   1.274 +}
   1.275 +
   1.276 +static force_inline uint64_t
   1.277 +to_uint64 (__m64 x)
   1.278 +{
   1.279 +#ifdef USE_CVT_INTRINSICS
   1.280 +    return _mm_cvtm64_si64 (x);
   1.281 +#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
   1.282 +    uint64_t res = x.M64_MEMBER;
   1.283 +    return res;
   1.284 +#elif defined USE_M64_DOUBLE
   1.285 +    return *(uint64_t *)&x;
   1.286 +#else /* USE_M64_CASTS */
   1.287 +    return (uint64_t)x;
   1.288 +#endif
   1.289 +}
   1.290 +
   1.291 +static force_inline __m64
   1.292 +shift (__m64 v,
   1.293 +       int   s)
   1.294 +{
   1.295 +    if (s > 0)
   1.296 +	return _mm_slli_si64 (v, s);
   1.297 +    else if (s < 0)
   1.298 +	return _mm_srli_si64 (v, -s);
   1.299 +    else
   1.300 +	return v;
   1.301 +}
   1.302 +
   1.303 +static force_inline __m64
   1.304 +negate (__m64 mask)
   1.305 +{
   1.306 +    return _mm_xor_si64 (mask, MC (4x00ff));
   1.307 +}
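/* negate computes the per-channel complement 255 - x: every channel of
 * an expanded operand is at most 0x00ff, so XORing each 16-bit word
 * with 0x00ff is exact. */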
   1.308 +
   1.309 +static force_inline __m64
   1.310 +pix_multiply (__m64 a, __m64 b)
   1.311 +{
   1.312 +    __m64 res;
   1.313 +
   1.314 +    res = _mm_mullo_pi16 (a, b);
   1.315 +    res = _mm_adds_pu16 (res, MC (4x0080));
   1.316 +    res = _mm_mulhi_pu16 (res, MC (4x0101));
   1.317 +
   1.318 +    return res;
   1.319 +}
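/* pix_multiply is an exact, rounded per-channel (a * b) / 255: bias by
 * 0x80, then multiply by 0x0101 and keep the high 16 bits (pmulhuw).
 * A scalar sketch of the same identity (illustrative helper only, not
 * part of the original patch):
 */
static force_inline uint8_t
mul_un8_sketch (uint8_t a, uint8_t b)
{
    uint32_t t = (uint32_t)a * b + 0x80;

    return (uint8_t)((t + (t >> 8)) >> 8);	/* == (t * 0x0101) >> 16 */
}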
   1.320 +
   1.321 +static force_inline __m64
   1.322 +pix_add (__m64 a, __m64 b)
   1.323 +{
   1.324 +    return _mm_adds_pu8 (a, b);
   1.325 +}
   1.326 +
   1.327 +static force_inline __m64
   1.328 +expand_alpha (__m64 pixel)
   1.329 +{
   1.330 +    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
   1.331 +}
   1.332 +
   1.333 +static force_inline __m64
   1.334 +expand_alpha_rev (__m64 pixel)
   1.335 +{
   1.336 +    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
   1.337 +}
   1.338 +
   1.339 +static force_inline __m64
   1.340 +invert_colors (__m64 pixel)
   1.341 +{
   1.342 +    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
   1.343 +}
   1.344 +
   1.345 +static force_inline __m64
   1.346 +over (__m64 src,
   1.347 +      __m64 srca,
   1.348 +      __m64 dest)
   1.349 +{
   1.350 +    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
   1.351 +}
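/* over () is the premultiplied Porter-Duff OVER operator, per channel:
 *
 *     dest' = src + dest * (255 - srca) / 255
 *
 * srca is passed in pre-expanded so callers can compute it once. */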
   1.352 +
   1.353 +static force_inline __m64
   1.354 +over_rev_non_pre (__m64 src, __m64 dest)
   1.355 +{
   1.356 +    __m64 srca = expand_alpha (src);
   1.357 +    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));
   1.358 +
   1.359 +    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
   1.360 +}
   1.361 +
   1.362 +static force_inline __m64
   1.363 +in (__m64 src, __m64 mask)
   1.364 +{
   1.365 +    return pix_multiply (src, mask);
   1.366 +}
   1.367 +
   1.368 +#ifndef _MSC_VER
   1.369 +static force_inline __m64
   1.370 +in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
   1.371 +{
   1.372 +    return over (in (src, mask), pix_multiply (srca, mask), dest);
   1.373 +}
   1.374 +
   1.375 +#else
   1.376 +
   1.377 +#define in_over(src, srca, mask, dest)					\
   1.378 +    over (in (src, mask), pix_multiply (srca, mask), dest)
   1.379 +
   1.380 +#endif
   1.381 +
   1.382 +/* Elemental unaligned loads */
   1.383 +
   1.384 +static force_inline __m64 ldq_u(__m64 *p)
   1.385 +{
   1.386 +#ifdef USE_X86_MMX
   1.387 +    /* x86's alignment restrictions are very relaxed. */
   1.388 +    return *(__m64 *)p;
   1.389 +#elif defined USE_ARM_IWMMXT
   1.390 +    int align = (uintptr_t)p & 7;
   1.391 +    __m64 *aligned_p;
   1.392 +    if (align == 0)
   1.393 +	return *p;
   1.394 +    aligned_p = (__m64 *)((uintptr_t)p & ~7);
   1.395 +    return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
   1.396 +#else
   1.397 +    struct __una_u64 { __m64 x __attribute__((packed)); };
   1.398 +    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
   1.399 +    return (__m64) ptr->x;
   1.400 +#endif
   1.401 +}
   1.402 +
   1.403 +static force_inline uint32_t ldl_u(const uint32_t *p)
   1.404 +{
   1.405 +#ifdef USE_X86_MMX
   1.406 +    /* x86's alignment restrictions are very relaxed. */
   1.407 +    return *p;
   1.408 +#else
   1.409 +    struct __una_u32 { uint32_t x __attribute__((packed)); };
   1.410 +    const struct __una_u32 *ptr = (const struct __una_u32 *) p;
   1.411 +    return ptr->x;
   1.412 +#endif
   1.413 +}
   1.414 +
   1.415 +static force_inline __m64
   1.416 +load (const uint32_t *v)
   1.417 +{
   1.418 +#ifdef USE_LOONGSON_MMI
   1.419 +    __m64 ret;
   1.420 +    asm ("lwc1 %0, %1\n\t"
   1.421 +	: "=f" (ret)
   1.422 +	: "m" (*v)
   1.423 +    );
   1.424 +    return ret;
   1.425 +#else
   1.426 +    return _mm_cvtsi32_si64 (*v);
   1.427 +#endif
   1.428 +}
   1.429 +
   1.430 +static force_inline __m64
   1.431 +load8888 (const uint32_t *v)
   1.432 +{
   1.433 +#ifdef USE_LOONGSON_MMI
   1.434 +    return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
   1.435 +#else
   1.436 +    return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
   1.437 +#endif
   1.438 +}
   1.439 +
   1.440 +static force_inline __m64
   1.441 +load8888u (const uint32_t *v)
   1.442 +{
   1.443 +    uint32_t l = ldl_u (v);
   1.444 +    return load8888 (&l);
   1.445 +}
   1.446 +
   1.447 +static force_inline __m64
   1.448 +pack8888 (__m64 lo, __m64 hi)
   1.449 +{
   1.450 +    return _mm_packs_pu16 (lo, hi);
   1.451 +}
   1.452 +
   1.453 +static force_inline void
   1.454 +store (uint32_t *dest, __m64 v)
   1.455 +{
   1.456 +#ifdef USE_LOONGSON_MMI
   1.457 +    asm ("swc1 %1, %0\n\t"
   1.458 +	: "=m" (*dest)
   1.459 +	: "f" (v)
   1.460 +	: "memory"
   1.461 +    );
   1.462 +#else
   1.463 +    *dest = _mm_cvtsi64_si32 (v);
   1.464 +#endif
   1.465 +}
   1.466 +
   1.467 +static force_inline void
   1.468 +store8888 (uint32_t *dest, __m64 v)
   1.469 +{
   1.470 +    v = pack8888 (v, _mm_setzero_si64 ());
   1.471 +    store (dest, v);
   1.472 +}
   1.473 +
   1.474 +static force_inline pixman_bool_t
   1.475 +is_equal (__m64 a, __m64 b)
   1.476 +{
   1.477 +#ifdef USE_LOONGSON_MMI
   1.478 +    /* __m64 is double, we can compare directly. */
   1.479 +    return a == b;
   1.480 +#else
   1.481 +    return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
   1.482 +#endif
   1.483 +}
   1.484 +
   1.485 +static force_inline pixman_bool_t
   1.486 +is_opaque (__m64 v)
   1.487 +{
   1.488 +#ifdef USE_LOONGSON_MMI
   1.489 +    return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
   1.490 +#else
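    /* v holds an expanded pixel (16 bits per channel), so the alpha byte
     * sits in byte 6; movemask bit 6 is set iff that byte equals 0xff. */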
   1.491 +    __m64 ffs = _mm_cmpeq_pi8 (v, v);
   1.492 +    return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
   1.493 +#endif
   1.494 +}
   1.495 +
   1.496 +static force_inline pixman_bool_t
   1.497 +is_zero (__m64 v)
   1.498 +{
   1.499 +    return is_equal (v, _mm_setzero_si64 ());
   1.500 +}
   1.501 +
   1.502 +/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
   1.503 + *
   1.504 + *    00RR00GG00BB
   1.505 + *
   1.506 + * --- Expanding 565 in the low word ---
   1.507 + *
    1.508 + * m = (m << (36 - 11)) | (m << (16 - 5)) | m;
    1.509 + * m = m & 0x01f0003f001f;
    1.510 + * m = m * 0x008404100840;
    1.511 + * m = m >> 8;
   1.512 + *
    1.513 + * Note the trick here: the top word is shifted by an extra nibble to
    1.514 + * avoid it bumping into the middle word
   1.515 + */
   1.516 +static force_inline __m64
   1.517 +expand565 (__m64 pixel, int pos)
   1.518 +{
   1.519 +    __m64 p = pixel;
   1.520 +    __m64 t1, t2;
   1.521 +
   1.522 +    /* move pixel to low 16 bit and zero the rest */
   1.523 +#ifdef USE_LOONGSON_MMI
   1.524 +    p = loongson_extract_pi16 (p, pos);
   1.525 +#else
   1.526 +    p = shift (shift (p, (3 - pos) * 16), -48);
   1.527 +#endif
   1.528 +
   1.529 +    t1 = shift (p, 36 - 11);
   1.530 +    t2 = shift (p, 16 - 5);
   1.531 +
   1.532 +    p = _mm_or_si64 (t1, p);
   1.533 +    p = _mm_or_si64 (t2, p);
   1.534 +    p = _mm_and_si64 (p, MC (565_rgb));
   1.535 +
   1.536 +    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
   1.537 +    return _mm_srli_pi16 (pixel, 8);
   1.538 +}
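/* A scalar sketch of the expansion above (illustrative helper, not part
 * of the original patch): each field's high bits are replicated into
 * the freed low bits, widening 5/6 bits to a full 8:
 */
static force_inline uint32_t
expand565_sketch (uint16_t p)
{
    uint32_t r = (p >> 11) & 0x1f;
    uint32_t g = (p >> 5) & 0x3f;
    uint32_t b = p & 0x1f;

    r = (r << 3) | (r >> 2);	/* 5 -> 8 bits */
    g = (g << 2) | (g >> 4);	/* 6 -> 8 bits */
    b = (b << 3) | (b >> 2);	/* 5 -> 8 bits */

    /* expand565 returns these values spread to 16 bits per channel */
    return (r << 16) | (g << 8) | b;
}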
   1.539 +
    1.540 +/* Expand four packed 565 pixels in an mmx register into two mmx
    1.541 + * registers, each holding two expanded pixels:
    1.542 + *    AARRGGBB AARRGGBB
    1.543 + */
   1.544 +static force_inline void
   1.545 +expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
   1.546 +{
   1.547 +    __m64 t0, t1, alpha = _mm_setzero_si64 ();
   1.548 +    __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
   1.549 +    __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
   1.550 +    __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
   1.551 +    if (full_alpha)
   1.552 +	alpha = _mm_cmpeq_pi32 (alpha, alpha);
   1.553 +
   1.554 +    /* Replicate high bits into empty low bits. */
   1.555 +    r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
   1.556 +    g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
   1.557 +    b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));
   1.558 +
   1.559 +    r = _mm_packs_pu16 (r, _mm_setzero_si64 ());	/* 00 00 00 00 R3 R2 R1 R0 */
   1.560 +    g = _mm_packs_pu16 (g, _mm_setzero_si64 ());	/* 00 00 00 00 G3 G2 G1 G0 */
   1.561 +    b = _mm_packs_pu16 (b, _mm_setzero_si64 ());	/* 00 00 00 00 B3 B2 B1 B0 */
   1.562 +
   1.563 +    t1 = _mm_unpacklo_pi8 (r, alpha);			/* A3 R3 A2 R2 A1 R1 A0 R0 */
   1.564 +    t0 = _mm_unpacklo_pi8 (b, g);			/* G3 B3 G2 B2 G1 B1 G0 B0 */
   1.565 +
   1.566 +    *vout0 = _mm_unpacklo_pi16 (t0, t1);		/* A1 R1 G1 B1 A0 R0 G0 B0 */
   1.567 +    *vout1 = _mm_unpackhi_pi16 (t0, t1);		/* A3 R3 G3 B3 A2 R2 G2 B2 */
   1.568 +}
   1.569 +
   1.570 +static force_inline __m64
   1.571 +expand8888 (__m64 in, int pos)
   1.572 +{
   1.573 +    if (pos == 0)
   1.574 +	return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
   1.575 +    else
   1.576 +	return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
   1.577 +}
   1.578 +
   1.579 +static force_inline __m64
   1.580 +expandx888 (__m64 in, int pos)
   1.581 +{
   1.582 +    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
   1.583 +}
   1.584 +
   1.585 +static force_inline void
   1.586 +expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
   1.587 +{
   1.588 +    __m64 v0, v1;
   1.589 +    expand_4xpacked565 (vin, &v0, &v1, full_alpha);
   1.590 +    *vout0 = expand8888 (v0, 0);
   1.591 +    *vout1 = expand8888 (v0, 1);
   1.592 +    *vout2 = expand8888 (v1, 0);
   1.593 +    *vout3 = expand8888 (v1, 1);
   1.594 +}
   1.595 +
   1.596 +static force_inline __m64
   1.597 +pack_565 (__m64 pixel, __m64 target, int pos)
   1.598 +{
   1.599 +    __m64 p = pixel;
   1.600 +    __m64 t = target;
   1.601 +    __m64 r, g, b;
   1.602 +
   1.603 +    r = _mm_and_si64 (p, MC (565_r));
   1.604 +    g = _mm_and_si64 (p, MC (565_g));
   1.605 +    b = _mm_and_si64 (p, MC (565_b));
   1.606 +
   1.607 +#ifdef USE_LOONGSON_MMI
   1.608 +    r = shift (r, -(32 - 8));
   1.609 +    g = shift (g, -(16 - 3));
   1.610 +    b = shift (b, -(0  + 3));
   1.611 +
   1.612 +    p = _mm_or_si64 (r, g);
   1.613 +    p = _mm_or_si64 (p, b);
   1.614 +    return loongson_insert_pi16 (t, p, pos);
   1.615 +#else
   1.616 +    r = shift (r, -(32 - 8) + pos * 16);
   1.617 +    g = shift (g, -(16 - 3) + pos * 16);
   1.618 +    b = shift (b, -(0  + 3) + pos * 16);
   1.619 +
   1.620 +    if (pos == 0)
   1.621 +	t = _mm_and_si64 (t, MC (mask_0));
   1.622 +    else if (pos == 1)
   1.623 +	t = _mm_and_si64 (t, MC (mask_1));
   1.624 +    else if (pos == 2)
   1.625 +	t = _mm_and_si64 (t, MC (mask_2));
   1.626 +    else if (pos == 3)
   1.627 +	t = _mm_and_si64 (t, MC (mask_3));
   1.628 +
   1.629 +    p = _mm_or_si64 (r, t);
   1.630 +    p = _mm_or_si64 (g, p);
   1.631 +
   1.632 +    return _mm_or_si64 (b, p);
   1.633 +#endif
   1.634 +}
   1.635 +
   1.636 +static force_inline __m64
   1.637 +pack_4xpacked565 (__m64 a, __m64 b)
   1.638 +{
   1.639 +    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
   1.640 +    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));
   1.641 +
   1.642 +    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
   1.643 +    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));
   1.644 +
   1.645 +    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
   1.646 +    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));
   1.647 +
   1.648 +    t0 = _mm_or_si64 (t0, g0);
   1.649 +    t1 = _mm_or_si64 (t1, g1);
   1.650 +
   1.651 +    t0 = shift(t0, -5);
   1.652 +#ifdef USE_ARM_IWMMXT
   1.653 +    t1 = shift(t1, -5);
   1.654 +    return _mm_packs_pu32 (t0, t1);
   1.655 +#else
   1.656 +    t1 = shift(t1, -5 + 16);
   1.657 +    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
   1.658 +#endif
   1.659 +}
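/* Per pixel, the madd trick above is equivalent to this scalar sketch
 * (illustrative helper, not part of the original patch): keep the top
 * 5/6/5 bits of each channel and pack them into 565 order:
 */
static force_inline uint16_t
pack565_sketch (uint32_t argb)
{
    uint32_t r = (argb >> 16) & 0xf8;	/* top 5 bits of red */
    uint32_t g = (argb >> 8) & 0xfc;	/* top 6 bits of green */
    uint32_t b = argb & 0xf8;		/* top 5 bits of blue */

    return (uint16_t)((r << 8) | (g << 3) | (b >> 3));
}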
   1.660 +
   1.661 +#ifndef _MSC_VER
   1.662 +
   1.663 +static force_inline __m64
   1.664 +pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
   1.665 +{
   1.666 +    return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
   1.667 +}
   1.668 +
   1.669 +static force_inline __m64
   1.670 +pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
   1.671 +{
   1.672 +    x = pix_multiply (x, a);
   1.673 +    y = pix_multiply (y, b);
   1.674 +
   1.675 +    return pix_add (x, y);
   1.676 +}
   1.677 +
   1.678 +#else
   1.679 +
    1.680 +/* MSVC only handles "pass by register" for up to three SSE intrinsic arguments */
   1.681 +
   1.682 +#define pack_4x565(v0, v1, v2, v3) \
   1.683 +    pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))
   1.684 +
   1.685 +#define pix_add_mul(x, a, y, b)	 \
   1.686 +    ( x = pix_multiply (x, a),	 \
   1.687 +      y = pix_multiply (y, b),	 \
   1.688 +      pix_add (x, y) )
   1.689 +
   1.690 +#endif
   1.691 +
    1.692 +/* --------------- MMX code paths for fbcompose.c --------------------- */
   1.693 +
   1.694 +static force_inline __m64
   1.695 +combine (const uint32_t *src, const uint32_t *mask)
   1.696 +{
   1.697 +    __m64 vsrc = load8888 (src);
   1.698 +
   1.699 +    if (mask)
   1.700 +    {
   1.701 +	__m64 m = load8888 (mask);
   1.702 +
   1.703 +	m = expand_alpha (m);
   1.704 +	vsrc = pix_multiply (vsrc, m);
   1.705 +    }
   1.706 +
   1.707 +    return vsrc;
   1.708 +}
   1.709 +
   1.710 +static force_inline __m64
   1.711 +core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
   1.712 +{
   1.713 +    vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());
   1.714 +
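    /* Fast paths: an opaque source replaces dest outright, and a fully
     * transparent source leaves dest unchanged; only the general case
     * pays for the full OVER. */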
   1.715 +    if (is_opaque (vsrc))
   1.716 +    {
   1.717 +	return vsrc;
   1.718 +    }
   1.719 +    else if (!is_zero (vsrc))
   1.720 +    {
   1.721 +	return over (vsrc, expand_alpha (vsrc),
   1.722 +		     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
   1.723 +    }
   1.724 +
   1.725 +    return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
   1.726 +}
   1.727 +
   1.728 +static void
   1.729 +mmx_combine_over_u (pixman_implementation_t *imp,
   1.730 +                    pixman_op_t              op,
   1.731 +                    uint32_t *               dest,
   1.732 +                    const uint32_t *         src,
   1.733 +                    const uint32_t *         mask,
   1.734 +                    int                      width)
   1.735 +{
   1.736 +    const uint32_t *end = dest + width;
   1.737 +
   1.738 +    while (dest < end)
   1.739 +    {
   1.740 +	__m64 vsrc = combine (src, mask);
   1.741 +
   1.742 +	if (is_opaque (vsrc))
   1.743 +	{
   1.744 +	    store8888 (dest, vsrc);
   1.745 +	}
   1.746 +	else if (!is_zero (vsrc))
   1.747 +	{
   1.748 +	    __m64 sa = expand_alpha (vsrc);
   1.749 +	    store8888 (dest, over (vsrc, sa, load8888 (dest)));
   1.750 +	}
   1.751 +
   1.752 +	++dest;
   1.753 +	++src;
   1.754 +	if (mask)
   1.755 +	    ++mask;
   1.756 +    }
   1.757 +    _mm_empty ();
   1.758 +}
   1.759 +
   1.760 +static void
   1.761 +mmx_combine_over_reverse_u (pixman_implementation_t *imp,
   1.762 +                            pixman_op_t              op,
   1.763 +                            uint32_t *               dest,
   1.764 +                            const uint32_t *         src,
   1.765 +                            const uint32_t *         mask,
   1.766 +                            int                      width)
   1.767 +{
   1.768 +    const uint32_t *end = dest + width;
   1.769 +
   1.770 +    while (dest < end)
   1.771 +    {
   1.772 +	__m64 d, da;
   1.773 +	__m64 s = combine (src, mask);
   1.774 +
   1.775 +	d = load8888 (dest);
   1.776 +	da = expand_alpha (d);
   1.777 +	store8888 (dest, over (d, da, s));
   1.778 +
   1.779 +	++dest;
   1.780 +	++src;
   1.781 +	if (mask)
   1.782 +	    mask++;
   1.783 +    }
   1.784 +    _mm_empty ();
   1.785 +}
   1.786 +
   1.787 +static void
   1.788 +mmx_combine_in_u (pixman_implementation_t *imp,
   1.789 +                  pixman_op_t              op,
   1.790 +                  uint32_t *               dest,
   1.791 +                  const uint32_t *         src,
   1.792 +                  const uint32_t *         mask,
   1.793 +                  int                      width)
   1.794 +{
   1.795 +    const uint32_t *end = dest + width;
   1.796 +
   1.797 +    while (dest < end)
   1.798 +    {
   1.799 +	__m64 a;
   1.800 +	__m64 x = combine (src, mask);
   1.801 +
   1.802 +	a = load8888 (dest);
   1.803 +	a = expand_alpha (a);
   1.804 +	x = pix_multiply (x, a);
   1.805 +
   1.806 +	store8888 (dest, x);
   1.807 +
   1.808 +	++dest;
   1.809 +	++src;
   1.810 +	if (mask)
   1.811 +	    mask++;
   1.812 +    }
   1.813 +    _mm_empty ();
   1.814 +}
   1.815 +
   1.816 +static void
   1.817 +mmx_combine_in_reverse_u (pixman_implementation_t *imp,
   1.818 +                          pixman_op_t              op,
   1.819 +                          uint32_t *               dest,
   1.820 +                          const uint32_t *         src,
   1.821 +                          const uint32_t *         mask,
   1.822 +                          int                      width)
   1.823 +{
   1.824 +    const uint32_t *end = dest + width;
   1.825 +
   1.826 +    while (dest < end)
   1.827 +    {
   1.828 +	__m64 a = combine (src, mask);
   1.829 +	__m64 x;
   1.830 +
   1.831 +	x = load8888 (dest);
   1.832 +	a = expand_alpha (a);
   1.833 +	x = pix_multiply (x, a);
   1.834 +	store8888 (dest, x);
   1.835 +
   1.836 +	++dest;
   1.837 +	++src;
   1.838 +	if (mask)
   1.839 +	    mask++;
   1.840 +    }
   1.841 +    _mm_empty ();
   1.842 +}
   1.843 +
   1.844 +static void
   1.845 +mmx_combine_out_u (pixman_implementation_t *imp,
   1.846 +                   pixman_op_t              op,
   1.847 +                   uint32_t *               dest,
   1.848 +                   const uint32_t *         src,
   1.849 +                   const uint32_t *         mask,
   1.850 +                   int                      width)
   1.851 +{
   1.852 +    const uint32_t *end = dest + width;
   1.853 +
   1.854 +    while (dest < end)
   1.855 +    {
   1.856 +	__m64 a;
   1.857 +	__m64 x = combine (src, mask);
   1.858 +
   1.859 +	a = load8888 (dest);
   1.860 +	a = expand_alpha (a);
   1.861 +	a = negate (a);
   1.862 +	x = pix_multiply (x, a);
   1.863 +	store8888 (dest, x);
   1.864 +
   1.865 +	++dest;
   1.866 +	++src;
   1.867 +	if (mask)
   1.868 +	    mask++;
   1.869 +    }
   1.870 +    _mm_empty ();
   1.871 +}
   1.872 +
   1.873 +static void
   1.874 +mmx_combine_out_reverse_u (pixman_implementation_t *imp,
   1.875 +                           pixman_op_t              op,
   1.876 +                           uint32_t *               dest,
   1.877 +                           const uint32_t *         src,
   1.878 +                           const uint32_t *         mask,
   1.879 +                           int                      width)
   1.880 +{
   1.881 +    const uint32_t *end = dest + width;
   1.882 +
   1.883 +    while (dest < end)
   1.884 +    {
   1.885 +	__m64 a = combine (src, mask);
   1.886 +	__m64 x;
   1.887 +
   1.888 +	x = load8888 (dest);
   1.889 +	a = expand_alpha (a);
   1.890 +	a = negate (a);
   1.891 +	x = pix_multiply (x, a);
   1.892 +
   1.893 +	store8888 (dest, x);
   1.894 +
   1.895 +	++dest;
   1.896 +	++src;
   1.897 +	if (mask)
   1.898 +	    mask++;
   1.899 +    }
   1.900 +    _mm_empty ();
   1.901 +}
   1.902 +
   1.903 +static void
   1.904 +mmx_combine_atop_u (pixman_implementation_t *imp,
   1.905 +                    pixman_op_t              op,
   1.906 +                    uint32_t *               dest,
   1.907 +                    const uint32_t *         src,
   1.908 +                    const uint32_t *         mask,
   1.909 +                    int                      width)
   1.910 +{
   1.911 +    const uint32_t *end = dest + width;
   1.912 +
   1.913 +    while (dest < end)
   1.914 +    {
   1.915 +	__m64 da, d, sia;
   1.916 +	__m64 s = combine (src, mask);
   1.917 +
   1.918 +	d = load8888 (dest);
   1.919 +	sia = expand_alpha (s);
   1.920 +	sia = negate (sia);
   1.921 +	da = expand_alpha (d);
   1.922 +	s = pix_add_mul (s, da, d, sia);
   1.923 +	store8888 (dest, s);
   1.924 +
   1.925 +	++dest;
   1.926 +	++src;
   1.927 +	if (mask)
   1.928 +	    mask++;
   1.929 +    }
   1.930 +    _mm_empty ();
   1.931 +}
   1.932 +
   1.933 +static void
   1.934 +mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
   1.935 +                            pixman_op_t              op,
   1.936 +                            uint32_t *               dest,
   1.937 +                            const uint32_t *         src,
   1.938 +                            const uint32_t *         mask,
   1.939 +                            int                      width)
   1.940 +{
   1.941 +    const uint32_t *end;
   1.942 +
   1.943 +    end = dest + width;
   1.944 +
   1.945 +    while (dest < end)
   1.946 +    {
   1.947 +	__m64 dia, d, sa;
   1.948 +	__m64 s = combine (src, mask);
   1.949 +
   1.950 +	d = load8888 (dest);
   1.951 +	sa = expand_alpha (s);
   1.952 +	dia = expand_alpha (d);
   1.953 +	dia = negate (dia);
   1.954 +	s = pix_add_mul (s, dia, d, sa);
   1.955 +	store8888 (dest, s);
   1.956 +
   1.957 +	++dest;
   1.958 +	++src;
   1.959 +	if (mask)
   1.960 +	    mask++;
   1.961 +    }
   1.962 +    _mm_empty ();
   1.963 +}
   1.964 +
   1.965 +static void
   1.966 +mmx_combine_xor_u (pixman_implementation_t *imp,
   1.967 +                   pixman_op_t              op,
   1.968 +                   uint32_t *               dest,
   1.969 +                   const uint32_t *         src,
   1.970 +                   const uint32_t *         mask,
   1.971 +                   int                      width)
   1.972 +{
   1.973 +    const uint32_t *end = dest + width;
   1.974 +
   1.975 +    while (dest < end)
   1.976 +    {
   1.977 +	__m64 dia, d, sia;
   1.978 +	__m64 s = combine (src, mask);
   1.979 +
   1.980 +	d = load8888 (dest);
   1.981 +	sia = expand_alpha (s);
   1.982 +	dia = expand_alpha (d);
   1.983 +	sia = negate (sia);
   1.984 +	dia = negate (dia);
   1.985 +	s = pix_add_mul (s, dia, d, sia);
   1.986 +	store8888 (dest, s);
   1.987 +
   1.988 +	++dest;
   1.989 +	++src;
   1.990 +	if (mask)
   1.991 +	    mask++;
   1.992 +    }
   1.993 +    _mm_empty ();
   1.994 +}
   1.995 +
   1.996 +static void
   1.997 +mmx_combine_add_u (pixman_implementation_t *imp,
   1.998 +                   pixman_op_t              op,
   1.999 +                   uint32_t *               dest,
  1.1000 +                   const uint32_t *         src,
  1.1001 +                   const uint32_t *         mask,
  1.1002 +                   int                      width)
  1.1003 +{
  1.1004 +    const uint32_t *end = dest + width;
  1.1005 +
  1.1006 +    while (dest < end)
  1.1007 +    {
  1.1008 +	__m64 d;
  1.1009 +	__m64 s = combine (src, mask);
  1.1010 +
  1.1011 +	d = load8888 (dest);
  1.1012 +	s = pix_add (s, d);
  1.1013 +	store8888 (dest, s);
  1.1014 +
  1.1015 +	++dest;
  1.1016 +	++src;
  1.1017 +	if (mask)
  1.1018 +	    mask++;
  1.1019 +    }
  1.1020 +    _mm_empty ();
  1.1021 +}
  1.1022 +
  1.1023 +static void
  1.1024 +mmx_combine_saturate_u (pixman_implementation_t *imp,
  1.1025 +                        pixman_op_t              op,
  1.1026 +                        uint32_t *               dest,
  1.1027 +                        const uint32_t *         src,
  1.1028 +                        const uint32_t *         mask,
  1.1029 +                        int                      width)
  1.1030 +{
  1.1031 +    const uint32_t *end = dest + width;
  1.1032 +
  1.1033 +    while (dest < end)
  1.1034 +    {
  1.1035 +	uint32_t s, sa, da;
  1.1036 +	uint32_t d = *dest;
  1.1037 +	__m64 ms = combine (src, mask);
  1.1038 +	__m64 md = load8888 (dest);
  1.1039 +
   1.1040 +	store8888 (&s, ms);
   1.1041 +	da = ~d >> 24;	/* headroom left in dest: 255 - dest alpha */
   1.1042 +	sa = s >> 24;	/* source alpha */
  1.1043 +
  1.1044 +	if (sa > da)
  1.1045 +	{
  1.1046 +	    uint32_t quot = DIV_UN8 (da, sa) << 24;
  1.1047 +	    __m64 msa = load8888 (&quot);
  1.1048 +	    msa = expand_alpha (msa);
  1.1049 +	    ms = pix_multiply (ms, msa);
  1.1050 +	}
  1.1051 +
  1.1052 +	md = pix_add (md, ms);
  1.1053 +	store8888 (dest, md);
  1.1054 +
  1.1055 +	++src;
  1.1056 +	++dest;
  1.1057 +	if (mask)
  1.1058 +	    mask++;
  1.1059 +    }
  1.1060 +    _mm_empty ();
  1.1061 +}
  1.1062 +
  1.1063 +static void
  1.1064 +mmx_combine_src_ca (pixman_implementation_t *imp,
  1.1065 +                    pixman_op_t              op,
  1.1066 +                    uint32_t *               dest,
  1.1067 +                    const uint32_t *         src,
  1.1068 +                    const uint32_t *         mask,
  1.1069 +                    int                      width)
  1.1070 +{
  1.1071 +    const uint32_t *end = src + width;
  1.1072 +
  1.1073 +    while (src < end)
  1.1074 +    {
  1.1075 +	__m64 a = load8888 (mask);
  1.1076 +	__m64 s = load8888 (src);
  1.1077 +
  1.1078 +	s = pix_multiply (s, a);
  1.1079 +	store8888 (dest, s);
  1.1080 +
  1.1081 +	++src;
  1.1082 +	++mask;
  1.1083 +	++dest;
  1.1084 +    }
  1.1085 +    _mm_empty ();
  1.1086 +}
  1.1087 +
  1.1088 +static void
  1.1089 +mmx_combine_over_ca (pixman_implementation_t *imp,
  1.1090 +                     pixman_op_t              op,
  1.1091 +                     uint32_t *               dest,
  1.1092 +                     const uint32_t *         src,
  1.1093 +                     const uint32_t *         mask,
  1.1094 +                     int                      width)
  1.1095 +{
  1.1096 +    const uint32_t *end = src + width;
  1.1097 +
  1.1098 +    while (src < end)
  1.1099 +    {
  1.1100 +	__m64 a = load8888 (mask);
  1.1101 +	__m64 s = load8888 (src);
  1.1102 +	__m64 d = load8888 (dest);
  1.1103 +	__m64 sa = expand_alpha (s);
  1.1104 +
  1.1105 +	store8888 (dest, in_over (s, sa, a, d));
  1.1106 +
  1.1107 +	++src;
  1.1108 +	++dest;
  1.1109 +	++mask;
  1.1110 +    }
  1.1111 +    _mm_empty ();
  1.1112 +}
  1.1113 +
  1.1114 +static void
  1.1115 +mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
  1.1116 +                             pixman_op_t              op,
  1.1117 +                             uint32_t *               dest,
  1.1118 +                             const uint32_t *         src,
  1.1119 +                             const uint32_t *         mask,
  1.1120 +                             int                      width)
  1.1121 +{
  1.1122 +    const uint32_t *end = src + width;
  1.1123 +
  1.1124 +    while (src < end)
  1.1125 +    {
  1.1126 +	__m64 a = load8888 (mask);
  1.1127 +	__m64 s = load8888 (src);
  1.1128 +	__m64 d = load8888 (dest);
  1.1129 +	__m64 da = expand_alpha (d);
  1.1130 +
  1.1131 +	store8888 (dest, over (d, da, in (s, a)));
  1.1132 +
  1.1133 +	++src;
  1.1134 +	++dest;
  1.1135 +	++mask;
  1.1136 +    }
  1.1137 +    _mm_empty ();
  1.1138 +}
  1.1139 +
  1.1140 +static void
  1.1141 +mmx_combine_in_ca (pixman_implementation_t *imp,
  1.1142 +                   pixman_op_t              op,
  1.1143 +                   uint32_t *               dest,
  1.1144 +                   const uint32_t *         src,
  1.1145 +                   const uint32_t *         mask,
  1.1146 +                   int                      width)
  1.1147 +{
  1.1148 +    const uint32_t *end = src + width;
  1.1149 +
  1.1150 +    while (src < end)
  1.1151 +    {
  1.1152 +	__m64 a = load8888 (mask);
  1.1153 +	__m64 s = load8888 (src);
  1.1154 +	__m64 d = load8888 (dest);
  1.1155 +	__m64 da = expand_alpha (d);
  1.1156 +
  1.1157 +	s = pix_multiply (s, a);
  1.1158 +	s = pix_multiply (s, da);
  1.1159 +	store8888 (dest, s);
  1.1160 +
  1.1161 +	++src;
  1.1162 +	++dest;
  1.1163 +	++mask;
  1.1164 +    }
  1.1165 +    _mm_empty ();
  1.1166 +}
  1.1167 +
  1.1168 +static void
  1.1169 +mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
  1.1170 +                           pixman_op_t              op,
  1.1171 +                           uint32_t *               dest,
  1.1172 +                           const uint32_t *         src,
  1.1173 +                           const uint32_t *         mask,
  1.1174 +                           int                      width)
  1.1175 +{
  1.1176 +    const uint32_t *end = src + width;
  1.1177 +
  1.1178 +    while (src < end)
  1.1179 +    {
  1.1180 +	__m64 a = load8888 (mask);
  1.1181 +	__m64 s = load8888 (src);
  1.1182 +	__m64 d = load8888 (dest);
  1.1183 +	__m64 sa = expand_alpha (s);
  1.1184 +
  1.1185 +	a = pix_multiply (a, sa);
  1.1186 +	d = pix_multiply (d, a);
  1.1187 +	store8888 (dest, d);
  1.1188 +
  1.1189 +	++src;
  1.1190 +	++dest;
  1.1191 +	++mask;
  1.1192 +    }
  1.1193 +    _mm_empty ();
  1.1194 +}
  1.1195 +
  1.1196 +static void
  1.1197 +mmx_combine_out_ca (pixman_implementation_t *imp,
  1.1198 +                    pixman_op_t              op,
  1.1199 +                    uint32_t *               dest,
  1.1200 +                    const uint32_t *         src,
  1.1201 +                    const uint32_t *         mask,
  1.1202 +                    int                      width)
  1.1203 +{
  1.1204 +    const uint32_t *end = src + width;
  1.1205 +
  1.1206 +    while (src < end)
  1.1207 +    {
  1.1208 +	__m64 a = load8888 (mask);
  1.1209 +	__m64 s = load8888 (src);
  1.1210 +	__m64 d = load8888 (dest);
  1.1211 +	__m64 da = expand_alpha (d);
  1.1212 +
  1.1213 +	da = negate (da);
  1.1214 +	s = pix_multiply (s, a);
  1.1215 +	s = pix_multiply (s, da);
  1.1216 +	store8888 (dest, s);
  1.1217 +
  1.1218 +	++src;
  1.1219 +	++dest;
  1.1220 +	++mask;
  1.1221 +    }
  1.1222 +    _mm_empty ();
  1.1223 +}
  1.1224 +
  1.1225 +static void
  1.1226 +mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
  1.1227 +                            pixman_op_t              op,
  1.1228 +                            uint32_t *               dest,
  1.1229 +                            const uint32_t *         src,
  1.1230 +                            const uint32_t *         mask,
  1.1231 +                            int                      width)
  1.1232 +{
  1.1233 +    const uint32_t *end = src + width;
  1.1234 +
  1.1235 +    while (src < end)
  1.1236 +    {
  1.1237 +	__m64 a = load8888 (mask);
  1.1238 +	__m64 s = load8888 (src);
  1.1239 +	__m64 d = load8888 (dest);
  1.1240 +	__m64 sa = expand_alpha (s);
  1.1241 +
  1.1242 +	a = pix_multiply (a, sa);
  1.1243 +	a = negate (a);
  1.1244 +	d = pix_multiply (d, a);
  1.1245 +	store8888 (dest, d);
  1.1246 +
  1.1247 +	++src;
  1.1248 +	++dest;
  1.1249 +	++mask;
  1.1250 +    }
  1.1251 +    _mm_empty ();
  1.1252 +}
  1.1253 +
  1.1254 +static void
  1.1255 +mmx_combine_atop_ca (pixman_implementation_t *imp,
  1.1256 +                     pixman_op_t              op,
  1.1257 +                     uint32_t *               dest,
  1.1258 +                     const uint32_t *         src,
  1.1259 +                     const uint32_t *         mask,
  1.1260 +                     int                      width)
  1.1261 +{
  1.1262 +    const uint32_t *end = src + width;
  1.1263 +
  1.1264 +    while (src < end)
  1.1265 +    {
  1.1266 +	__m64 a = load8888 (mask);
  1.1267 +	__m64 s = load8888 (src);
  1.1268 +	__m64 d = load8888 (dest);
  1.1269 +	__m64 da = expand_alpha (d);
  1.1270 +	__m64 sa = expand_alpha (s);
  1.1271 +
  1.1272 +	s = pix_multiply (s, a);
  1.1273 +	a = pix_multiply (a, sa);
  1.1274 +	a = negate (a);
  1.1275 +	d = pix_add_mul (d, a, s, da);
  1.1276 +	store8888 (dest, d);
  1.1277 +
  1.1278 +	++src;
  1.1279 +	++dest;
  1.1280 +	++mask;
  1.1281 +    }
  1.1282 +    _mm_empty ();
  1.1283 +}
  1.1284 +
  1.1285 +static void
  1.1286 +mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
  1.1287 +                             pixman_op_t              op,
  1.1288 +                             uint32_t *               dest,
  1.1289 +                             const uint32_t *         src,
  1.1290 +                             const uint32_t *         mask,
  1.1291 +                             int                      width)
  1.1292 +{
  1.1293 +    const uint32_t *end = src + width;
  1.1294 +
  1.1295 +    while (src < end)
  1.1296 +    {
  1.1297 +	__m64 a = load8888 (mask);
  1.1298 +	__m64 s = load8888 (src);
  1.1299 +	__m64 d = load8888 (dest);
  1.1300 +	__m64 da = expand_alpha (d);
  1.1301 +	__m64 sa = expand_alpha (s);
  1.1302 +
  1.1303 +	s = pix_multiply (s, a);
  1.1304 +	a = pix_multiply (a, sa);
  1.1305 +	da = negate (da);
  1.1306 +	d = pix_add_mul (d, a, s, da);
  1.1307 +	store8888 (dest, d);
  1.1308 +
  1.1309 +	++src;
  1.1310 +	++dest;
  1.1311 +	++mask;
  1.1312 +    }
  1.1313 +    _mm_empty ();
  1.1314 +}
  1.1315 +
  1.1316 +static void
  1.1317 +mmx_combine_xor_ca (pixman_implementation_t *imp,
  1.1318 +                    pixman_op_t              op,
  1.1319 +                    uint32_t *               dest,
  1.1320 +                    const uint32_t *         src,
  1.1321 +                    const uint32_t *         mask,
  1.1322 +                    int                      width)
  1.1323 +{
  1.1324 +    const uint32_t *end = src + width;
  1.1325 +
  1.1326 +    while (src < end)
  1.1327 +    {
  1.1328 +	__m64 a = load8888 (mask);
  1.1329 +	__m64 s = load8888 (src);
  1.1330 +	__m64 d = load8888 (dest);
  1.1331 +	__m64 da = expand_alpha (d);
  1.1332 +	__m64 sa = expand_alpha (s);
  1.1333 +
  1.1334 +	s = pix_multiply (s, a);
  1.1335 +	a = pix_multiply (a, sa);
  1.1336 +	da = negate (da);
  1.1337 +	a = negate (a);
  1.1338 +	d = pix_add_mul (d, a, s, da);
  1.1339 +	store8888 (dest, d);
  1.1340 +
  1.1341 +	++src;
  1.1342 +	++dest;
  1.1343 +	++mask;
  1.1344 +    }
  1.1345 +    _mm_empty ();
  1.1346 +}
  1.1347 +
  1.1348 +static void
  1.1349 +mmx_combine_add_ca (pixman_implementation_t *imp,
  1.1350 +                    pixman_op_t              op,
  1.1351 +                    uint32_t *               dest,
  1.1352 +                    const uint32_t *         src,
  1.1353 +                    const uint32_t *         mask,
  1.1354 +                    int                      width)
  1.1355 +{
  1.1356 +    const uint32_t *end = src + width;
  1.1357 +
  1.1358 +    while (src < end)
  1.1359 +    {
  1.1360 +	__m64 a = load8888 (mask);
  1.1361 +	__m64 s = load8888 (src);
  1.1362 +	__m64 d = load8888 (dest);
  1.1363 +
  1.1364 +	s = pix_multiply (s, a);
  1.1365 +	d = pix_add (s, d);
  1.1366 +	store8888 (dest, d);
  1.1367 +
  1.1368 +	++src;
  1.1369 +	++dest;
  1.1370 +	++mask;
  1.1371 +    }
  1.1372 +    _mm_empty ();
  1.1373 +}
  1.1374 +
  1.1375 +/* ------------- MMX code paths called from fbpict.c -------------------- */
  1.1376 +
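/* The fast paths below share one loop shape: handle pixels one at a
 * time until dst is 8-byte aligned, run an unrolled __m64 inner loop
 * over aligned pairs (or larger groups) of pixels, then finish any
 * leftovers a pixel at a time. */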
  1.1377 +static void
  1.1378 +mmx_composite_over_n_8888 (pixman_implementation_t *imp,
  1.1379 +                           pixman_composite_info_t *info)
  1.1380 +{
  1.1381 +    PIXMAN_COMPOSITE_ARGS (info);
  1.1382 +    uint32_t src;
  1.1383 +    uint32_t    *dst_line, *dst;
  1.1384 +    int32_t w;
  1.1385 +    int dst_stride;
  1.1386 +    __m64 vsrc, vsrca;
  1.1387 +
  1.1388 +    CHECKPOINT ();
  1.1389 +
  1.1390 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.1391 +
  1.1392 +    if (src == 0)
  1.1393 +	return;
  1.1394 +
  1.1395 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.1396 +
  1.1397 +    vsrc = load8888 (&src);
  1.1398 +    vsrca = expand_alpha (vsrc);
  1.1399 +
  1.1400 +    while (height--)
  1.1401 +    {
  1.1402 +	dst = dst_line;
  1.1403 +	dst_line += dst_stride;
  1.1404 +	w = width;
  1.1405 +
  1.1406 +	CHECKPOINT ();
  1.1407 +
  1.1408 +	while (w && (uintptr_t)dst & 7)
  1.1409 +	{
  1.1410 +	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
  1.1411 +
  1.1412 +	    w--;
  1.1413 +	    dst++;
  1.1414 +	}
  1.1415 +
  1.1416 +	while (w >= 2)
  1.1417 +	{
  1.1418 +	    __m64 vdest;
  1.1419 +	    __m64 dest0, dest1;
  1.1420 +
  1.1421 +	    vdest = *(__m64 *)dst;
  1.1422 +
  1.1423 +	    dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
  1.1424 +	    dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));
  1.1425 +
  1.1426 +	    *(__m64 *)dst = pack8888 (dest0, dest1);
  1.1427 +
  1.1428 +	    dst += 2;
  1.1429 +	    w -= 2;
  1.1430 +	}
  1.1431 +
  1.1432 +	CHECKPOINT ();
  1.1433 +
  1.1434 +	if (w)
  1.1435 +	{
  1.1436 +	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
  1.1437 +	}
  1.1438 +    }
  1.1439 +
  1.1440 +    _mm_empty ();
  1.1441 +}
  1.1442 +
  1.1443 +static void
  1.1444 +mmx_composite_over_n_0565 (pixman_implementation_t *imp,
  1.1445 +                           pixman_composite_info_t *info)
  1.1446 +{
  1.1447 +    PIXMAN_COMPOSITE_ARGS (info);
  1.1448 +    uint32_t src;
  1.1449 +    uint16_t    *dst_line, *dst;
  1.1450 +    int32_t w;
  1.1451 +    int dst_stride;
  1.1452 +    __m64 vsrc, vsrca;
  1.1453 +
  1.1454 +    CHECKPOINT ();
  1.1455 +
  1.1456 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.1457 +
  1.1458 +    if (src == 0)
  1.1459 +	return;
  1.1460 +
  1.1461 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  1.1462 +
  1.1463 +    vsrc = load8888 (&src);
  1.1464 +    vsrca = expand_alpha (vsrc);
  1.1465 +
  1.1466 +    while (height--)
  1.1467 +    {
  1.1468 +	dst = dst_line;
  1.1469 +	dst_line += dst_stride;
  1.1470 +	w = width;
  1.1471 +
  1.1472 +	CHECKPOINT ();
  1.1473 +
  1.1474 +	while (w && (uintptr_t)dst & 7)
  1.1475 +	{
  1.1476 +	    uint64_t d = *dst;
  1.1477 +	    __m64 vdest = expand565 (to_m64 (d), 0);
  1.1478 +
  1.1479 +	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
  1.1480 +	    *dst = to_uint64 (vdest);
  1.1481 +
  1.1482 +	    w--;
  1.1483 +	    dst++;
  1.1484 +	}
  1.1485 +
  1.1486 +	while (w >= 4)
  1.1487 +	{
  1.1488 +	    __m64 vdest = *(__m64 *)dst;
  1.1489 +	    __m64 v0, v1, v2, v3;
  1.1490 +
  1.1491 +	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
  1.1492 +
  1.1493 +	    v0 = over (vsrc, vsrca, v0);
  1.1494 +	    v1 = over (vsrc, vsrca, v1);
  1.1495 +	    v2 = over (vsrc, vsrca, v2);
  1.1496 +	    v3 = over (vsrc, vsrca, v3);
  1.1497 +
  1.1498 +	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
  1.1499 +
  1.1500 +	    dst += 4;
  1.1501 +	    w -= 4;
  1.1502 +	}
  1.1503 +
  1.1504 +	CHECKPOINT ();
  1.1505 +
  1.1506 +	while (w)
  1.1507 +	{
  1.1508 +	    uint64_t d = *dst;
  1.1509 +	    __m64 vdest = expand565 (to_m64 (d), 0);
  1.1510 +
  1.1511 +	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
  1.1512 +	    *dst = to_uint64 (vdest);
  1.1513 +
  1.1514 +	    w--;
  1.1515 +	    dst++;
  1.1516 +	}
  1.1517 +    }
  1.1518 +
  1.1519 +    _mm_empty ();
  1.1520 +}
  1.1521 +
  1.1522 +static void
  1.1523 +mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
  1.1524 +                                   pixman_composite_info_t *info)
  1.1525 +{
  1.1526 +    PIXMAN_COMPOSITE_ARGS (info);
  1.1527 +    uint32_t src;
  1.1528 +    uint32_t    *dst_line;
  1.1529 +    uint32_t    *mask_line;
  1.1530 +    int dst_stride, mask_stride;
  1.1531 +    __m64 vsrc, vsrca;
  1.1532 +
  1.1533 +    CHECKPOINT ();
  1.1534 +
  1.1535 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.1536 +
  1.1537 +    if (src == 0)
  1.1538 +	return;
  1.1539 +
  1.1540 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.1541 +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
  1.1542 +
  1.1543 +    vsrc = load8888 (&src);
  1.1544 +    vsrca = expand_alpha (vsrc);
  1.1545 +
  1.1546 +    while (height--)
  1.1547 +    {
  1.1548 +	int twidth = width;
  1.1549 +	uint32_t *p = (uint32_t *)mask_line;
  1.1550 +	uint32_t *q = (uint32_t *)dst_line;
  1.1551 +
  1.1552 +	while (twidth && (uintptr_t)q & 7)
  1.1553 +	{
  1.1554 +	    uint32_t m = *(uint32_t *)p;
  1.1555 +
  1.1556 +	    if (m)
  1.1557 +	    {
  1.1558 +		__m64 vdest = load8888 (q);
  1.1559 +		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
  1.1560 +		store8888 (q, vdest);
  1.1561 +	    }
  1.1562 +
  1.1563 +	    twidth--;
  1.1564 +	    p++;
  1.1565 +	    q++;
  1.1566 +	}
  1.1567 +
  1.1568 +	while (twidth >= 2)
  1.1569 +	{
  1.1570 +	    uint32_t m0, m1;
  1.1571 +	    m0 = *p;
  1.1572 +	    m1 = *(p + 1);
  1.1573 +
  1.1574 +	    if (m0 | m1)
  1.1575 +	    {
  1.1576 +		__m64 dest0, dest1;
  1.1577 +		__m64 vdest = *(__m64 *)q;
  1.1578 +
  1.1579 +		dest0 = in_over (vsrc, vsrca, load8888 (&m0),
  1.1580 +		                 expand8888 (vdest, 0));
  1.1581 +		dest1 = in_over (vsrc, vsrca, load8888 (&m1),
  1.1582 +		                 expand8888 (vdest, 1));
  1.1583 +
  1.1584 +		*(__m64 *)q = pack8888 (dest0, dest1);
  1.1585 +	    }
  1.1586 +
  1.1587 +	    p += 2;
  1.1588 +	    q += 2;
  1.1589 +	    twidth -= 2;
  1.1590 +	}
  1.1591 +
  1.1592 +	if (twidth)
  1.1593 +	{
  1.1594 +	    uint32_t m = *(uint32_t *)p;
  1.1595 +
  1.1596 +	    if (m)
  1.1597 +	    {
  1.1598 +		__m64 vdest = load8888 (q);
  1.1599 +		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
  1.1600 +		store8888 (q, vdest);
  1.1601 +	    }
  1.1602 +
  1.1603 +	    twidth--;
  1.1604 +	    p++;
  1.1605 +	    q++;
  1.1606 +	}
  1.1607 +
  1.1608 +	dst_line += dst_stride;
  1.1609 +	mask_line += mask_stride;
  1.1610 +    }
  1.1611 +
  1.1612 +    _mm_empty ();
  1.1613 +}
  1.1614 +
  1.1615 +static void
  1.1616 +mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
  1.1617 +                                pixman_composite_info_t *info)
  1.1618 +{
  1.1619 +    PIXMAN_COMPOSITE_ARGS (info);
  1.1620 +    uint32_t    *dst_line, *dst;
  1.1621 +    uint32_t    *src_line, *src;
  1.1622 +    uint32_t mask;
  1.1623 +    __m64 vmask;
  1.1624 +    int dst_stride, src_stride;
  1.1625 +    int32_t w;
  1.1626 +
  1.1627 +    CHECKPOINT ();
  1.1628 +
  1.1629 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.1630 +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.1631 +
  1.1632 +    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
  1.1633 +    vmask = expand_alpha (load8888 (&mask));
  1.1634 +
  1.1635 +    while (height--)
  1.1636 +    {
  1.1637 +	dst = dst_line;
  1.1638 +	dst_line += dst_stride;
  1.1639 +	src = src_line;
  1.1640 +	src_line += src_stride;
  1.1641 +	w = width;
  1.1642 +
  1.1643 +	while (w && (uintptr_t)dst & 7)
  1.1644 +	{
  1.1645 +	    __m64 s = load8888 (src);
  1.1646 +	    __m64 d = load8888 (dst);
  1.1647 +
  1.1648 +	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
  1.1649 +
  1.1650 +	    w--;
  1.1651 +	    dst++;
  1.1652 +	    src++;
  1.1653 +	}
  1.1654 +
  1.1655 +	while (w >= 2)
  1.1656 +	{
  1.1657 +	    __m64 vs = ldq_u ((__m64 *)src);
  1.1658 +	    __m64 vd = *(__m64 *)dst;
  1.1659 +	    __m64 vsrc0 = expand8888 (vs, 0);
  1.1660 +	    __m64 vsrc1 = expand8888 (vs, 1);
  1.1661 +
  1.1662 +	    *(__m64 *)dst = pack8888 (
  1.1663 +	        in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
  1.1664 +	        in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
  1.1665 +
  1.1666 +	    w -= 2;
  1.1667 +	    dst += 2;
  1.1668 +	    src += 2;
  1.1669 +	}
  1.1670 +
  1.1671 +	if (w)
  1.1672 +	{
  1.1673 +	    __m64 s = load8888 (src);
  1.1674 +	    __m64 d = load8888 (dst);
  1.1675 +
  1.1676 +	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
  1.1677 +	}
  1.1678 +    }
  1.1679 +
  1.1680 +    _mm_empty ();
  1.1681 +}
  1.1682 +
  1.1683 +static void
  1.1684 +mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
  1.1685 +                                pixman_composite_info_t *info)
  1.1686 +{
  1.1687 +    PIXMAN_COMPOSITE_ARGS (info);
  1.1688 +    uint32_t *dst_line, *dst;
  1.1689 +    uint32_t *src_line, *src;
  1.1690 +    uint32_t mask;
  1.1691 +    __m64 vmask;
  1.1692 +    int dst_stride, src_stride;
  1.1693 +    int32_t w;
  1.1694 +    __m64 srca;
  1.1695 +
  1.1696 +    CHECKPOINT ();
  1.1697 +
  1.1698 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.1699 +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.1700 +    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
  1.1701 +
  1.1702 +    vmask = expand_alpha (load8888 (&mask));
  1.1703 +    srca = MC (4x00ff);
  1.1704 +
  1.1705 +    while (height--)
  1.1706 +    {
  1.1707 +	dst = dst_line;
  1.1708 +	dst_line += dst_stride;
  1.1709 +	src = src_line;
  1.1710 +	src_line += src_stride;
  1.1711 +	w = width;
  1.1712 +
  1.1713 +	while (w && (uintptr_t)dst & 7)
  1.1714 +	{
  1.1715 +	    uint32_t ssrc = *src | 0xff000000;
  1.1716 +	    __m64 s = load8888 (&ssrc);
  1.1717 +	    __m64 d = load8888 (dst);
  1.1718 +
  1.1719 +	    store8888 (dst, in_over (s, srca, vmask, d));
  1.1720 +
  1.1721 +	    w--;
  1.1722 +	    dst++;
  1.1723 +	    src++;
  1.1724 +	}
  1.1725 +
  1.1726 +	while (w >= 16)
  1.1727 +	{
  1.1728 +	    __m64 vd0 = *(__m64 *)(dst + 0);
  1.1729 +	    __m64 vd1 = *(__m64 *)(dst + 2);
  1.1730 +	    __m64 vd2 = *(__m64 *)(dst + 4);
  1.1731 +	    __m64 vd3 = *(__m64 *)(dst + 6);
  1.1732 +	    __m64 vd4 = *(__m64 *)(dst + 8);
  1.1733 +	    __m64 vd5 = *(__m64 *)(dst + 10);
  1.1734 +	    __m64 vd6 = *(__m64 *)(dst + 12);
  1.1735 +	    __m64 vd7 = *(__m64 *)(dst + 14);
  1.1736 +
  1.1737 +	    __m64 vs0 = ldq_u ((__m64 *)(src + 0));
  1.1738 +	    __m64 vs1 = ldq_u ((__m64 *)(src + 2));
  1.1739 +	    __m64 vs2 = ldq_u ((__m64 *)(src + 4));
  1.1740 +	    __m64 vs3 = ldq_u ((__m64 *)(src + 6));
  1.1741 +	    __m64 vs4 = ldq_u ((__m64 *)(src + 8));
  1.1742 +	    __m64 vs5 = ldq_u ((__m64 *)(src + 10));
  1.1743 +	    __m64 vs6 = ldq_u ((__m64 *)(src + 12));
  1.1744 +	    __m64 vs7 = ldq_u ((__m64 *)(src + 14));
  1.1745 +
  1.1746 +	    vd0 = pack8888 (
  1.1747 +	        in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
  1.1748 +	        in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
  1.1749 +
  1.1750 +	    vd1 = pack8888 (
  1.1751 +	        in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
  1.1752 +	        in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
  1.1753 +
  1.1754 +	    vd2 = pack8888 (
  1.1755 +	        in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
  1.1756 +	        in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
  1.1757 +
  1.1758 +	    vd3 = pack8888 (
  1.1759 +	        in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
  1.1760 +	        in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
  1.1761 +
  1.1762 +	    vd4 = pack8888 (
  1.1763 +	        in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
  1.1764 +	        in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
  1.1765 +
  1.1766 +	    vd5 = pack8888 (
  1.1767 +	        in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
  1.1768 +	        in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
  1.1769 +
  1.1770 +	    vd6 = pack8888 (
  1.1771 +	        in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
  1.1772 +	        in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
  1.1773 +
  1.1774 +	    vd7 = pack8888 (
  1.1775 +	        in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
  1.1776 +	        in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
  1.1777 +
  1.1778 +	    *(__m64 *)(dst + 0) = vd0;
  1.1779 +	    *(__m64 *)(dst + 2) = vd1;
  1.1780 +	    *(__m64 *)(dst + 4) = vd2;
  1.1781 +	    *(__m64 *)(dst + 6) = vd3;
  1.1782 +	    *(__m64 *)(dst + 8) = vd4;
  1.1783 +	    *(__m64 *)(dst + 10) = vd5;
  1.1784 +	    *(__m64 *)(dst + 12) = vd6;
  1.1785 +	    *(__m64 *)(dst + 14) = vd7;
  1.1786 +
  1.1787 +	    w -= 16;
  1.1788 +	    dst += 16;
  1.1789 +	    src += 16;
  1.1790 +	}
  1.1791 +
  1.1792 +	while (w)
  1.1793 +	{
  1.1794 +	    uint32_t ssrc = *src | 0xff000000;
  1.1795 +	    __m64 s = load8888 (&ssrc);
  1.1796 +	    __m64 d = load8888 (dst);
  1.1797 +
  1.1798 +	    store8888 (dst, in_over (s, srca, vmask, d));
  1.1799 +
  1.1800 +	    w--;
  1.1801 +	    dst++;
  1.1802 +	    src++;
  1.1803 +	}
  1.1804 +    }
  1.1805 +
  1.1806 +    _mm_empty ();
  1.1807 +}
  1.1808 +
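          +/* Plain OVER with a per-pixel operator.  Two scalar shortcuts keep
          + * the common cases cheap: a fully opaque source pixel (a == 0xff)
          + * is copied straight to the destination and a fully zero pixel is
          + * skipped; only translucent pixels pay for the expand/over/pack
          + * sequence.
          + */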
  1.1809 +static void
  1.1810 +mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
  1.1811 +                              pixman_composite_info_t *info)
  1.1812 +{
  1.1813 +    PIXMAN_COMPOSITE_ARGS (info);
  1.1814 +    uint32_t *dst_line, *dst;
  1.1815 +    uint32_t *src_line, *src;
  1.1816 +    uint32_t s;
  1.1817 +    int dst_stride, src_stride;
  1.1818 +    uint8_t a;
  1.1819 +    int32_t w;
  1.1820 +
  1.1821 +    CHECKPOINT ();
  1.1822 +
  1.1823 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.1824 +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.1825 +
  1.1826 +    while (height--)
  1.1827 +    {
  1.1828 +	dst = dst_line;
  1.1829 +	dst_line += dst_stride;
  1.1830 +	src = src_line;
  1.1831 +	src_line += src_stride;
  1.1832 +	w = width;
  1.1833 +
  1.1834 +	while (w--)
  1.1835 +	{
  1.1836 +	    s = *src++;
  1.1837 +	    a = s >> 24;
  1.1838 +
  1.1839 +	    if (a == 0xff)
  1.1840 +	    {
  1.1841 +		*dst = s;
  1.1842 +	    }
  1.1843 +	    else if (s)
  1.1844 +	    {
  1.1845 +		__m64 ms, sa;
  1.1846 +		ms = load8888 (&s);
  1.1847 +		sa = expand_alpha (ms);
  1.1848 +		store8888 (dst, over (ms, sa, load8888 (dst)));
  1.1849 +	    }
  1.1850 +
  1.1851 +	    dst++;
  1.1852 +	}
  1.1853 +    }
  1.1854 +    _mm_empty ();
  1.1855 +}
  1.1856 +
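          +/* OVER onto r5g6b5: each destination pixel is widened to 8 bits
          + * per channel with expand565 ()/expand_4x565 (), blended in 8888
          + * space, and repacked with pack_565 ()/pack_4x565 ().  The 4-pixel
          + * body keeps all four intermediates in MMX registers.
          + */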
  1.1857 +static void
  1.1858 +mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
  1.1859 +                              pixman_composite_info_t *info)
  1.1860 +{
  1.1861 +    PIXMAN_COMPOSITE_ARGS (info);
  1.1862 +    uint16_t    *dst_line, *dst;
  1.1863 +    uint32_t    *src_line, *src;
  1.1864 +    int dst_stride, src_stride;
  1.1865 +    int32_t w;
  1.1866 +
  1.1867 +    CHECKPOINT ();
  1.1868 +
  1.1869 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  1.1870 +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.1871 +
  1.1872 +#if 0
  1.1873 +    /* FIXME */
  1.1874 +    assert (src_image->drawable == mask_image->drawable);
  1.1875 +#endif
  1.1876 +
  1.1877 +    while (height--)
  1.1878 +    {
  1.1879 +	dst = dst_line;
  1.1880 +	dst_line += dst_stride;
  1.1881 +	src = src_line;
  1.1882 +	src_line += src_stride;
  1.1883 +	w = width;
  1.1884 +
  1.1885 +	CHECKPOINT ();
  1.1886 +
  1.1887 +	while (w && (uintptr_t)dst & 7)
  1.1888 +	{
  1.1889 +	    __m64 vsrc = load8888 (src);
  1.1890 +	    uint64_t d = *dst;
  1.1891 +	    __m64 vdest = expand565 (to_m64 (d), 0);
  1.1892 +
  1.1893 +	    vdest = pack_565 (
  1.1894 +		over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
  1.1895 +
  1.1896 +	    *dst = to_uint64 (vdest);
  1.1897 +
  1.1898 +	    w--;
  1.1899 +	    dst++;
  1.1900 +	    src++;
  1.1901 +	}
  1.1902 +
  1.1903 +	CHECKPOINT ();
  1.1904 +
  1.1905 +	while (w >= 4)
  1.1906 +	{
  1.1907 +	    __m64 vdest = *(__m64 *)dst;
  1.1908 +	    __m64 v0, v1, v2, v3;
  1.1909 +	    __m64 vsrc0, vsrc1, vsrc2, vsrc3;
  1.1910 +
  1.1911 +	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
  1.1912 +
  1.1913 +	    vsrc0 = load8888 ((src + 0));
  1.1914 +	    vsrc1 = load8888 ((src + 1));
  1.1915 +	    vsrc2 = load8888 ((src + 2));
  1.1916 +	    vsrc3 = load8888 ((src + 3));
  1.1917 +
  1.1918 +	    v0 = over (vsrc0, expand_alpha (vsrc0), v0);
  1.1919 +	    v1 = over (vsrc1, expand_alpha (vsrc1), v1);
  1.1920 +	    v2 = over (vsrc2, expand_alpha (vsrc2), v2);
  1.1921 +	    v3 = over (vsrc3, expand_alpha (vsrc3), v3);
  1.1922 +
  1.1923 +	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
  1.1924 +
  1.1925 +	    w -= 4;
  1.1926 +	    dst += 4;
  1.1927 +	    src += 4;
  1.1928 +	}
  1.1929 +
  1.1930 +	CHECKPOINT ();
  1.1931 +
  1.1932 +	while (w)
  1.1933 +	{
  1.1934 +	    __m64 vsrc = load8888 (src);
  1.1935 +	    uint64_t d = *dst;
  1.1936 +	    __m64 vdest = expand565 (to_m64 (d), 0);
  1.1937 +
  1.1938 +	    vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
  1.1939 +
  1.1940 +	    *dst = to_uint64 (vdest);
  1.1941 +
  1.1942 +	    w--;
  1.1943 +	    dst++;
  1.1944 +	    src++;
  1.1945 +	}
  1.1946 +    }
  1.1947 +
  1.1948 +    _mm_empty ();
  1.1949 +}
  1.1950 +
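          +/* Solid source under an a8 mask.  expand_alpha_rev () broadcasts
          + * the 8-bit mask value to all four channels for in_over ().  The
          + * two-pixel body also has an opaque shortcut: when srca == 0xff
          + * and both mask bytes are 0xff, the pre-replicated srcsrc
          + * quadword is stored directly.
          + */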
  1.1951 +static void
  1.1952 +mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
  1.1953 +                             pixman_composite_info_t *info)
  1.1954 +{
  1.1955 +    PIXMAN_COMPOSITE_ARGS (info);
  1.1956 +    uint32_t src, srca;
  1.1957 +    uint32_t *dst_line, *dst;
  1.1958 +    uint8_t *mask_line, *mask;
  1.1959 +    int dst_stride, mask_stride;
  1.1960 +    int32_t w;
  1.1961 +    __m64 vsrc, vsrca;
  1.1962 +    uint64_t srcsrc;
  1.1963 +
  1.1964 +    CHECKPOINT ();
  1.1965 +
  1.1966 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.1967 +
  1.1968 +    srca = src >> 24;
  1.1969 +    if (src == 0)
  1.1970 +	return;
  1.1971 +
  1.1972 +    srcsrc = (uint64_t)src << 32 | src;
  1.1973 +
  1.1974 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.1975 +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  1.1976 +
  1.1977 +    vsrc = load8888 (&src);
  1.1978 +    vsrca = expand_alpha (vsrc);
  1.1979 +
  1.1980 +    while (height--)
  1.1981 +    {
  1.1982 +	dst = dst_line;
  1.1983 +	dst_line += dst_stride;
  1.1984 +	mask = mask_line;
  1.1985 +	mask_line += mask_stride;
  1.1986 +	w = width;
  1.1987 +
  1.1988 +	CHECKPOINT ();
  1.1989 +
  1.1990 +	while (w && (uintptr_t)dst & 7)
  1.1991 +	{
  1.1992 +	    uint64_t m = *mask;
  1.1993 +
  1.1994 +	    if (m)
  1.1995 +	    {
  1.1996 +		__m64 vdest = in_over (vsrc, vsrca,
  1.1997 +				       expand_alpha_rev (to_m64 (m)),
  1.1998 +				       load8888 (dst));
  1.1999 +
  1.2000 +		store8888 (dst, vdest);
  1.2001 +	    }
  1.2002 +
  1.2003 +	    w--;
  1.2004 +	    mask++;
  1.2005 +	    dst++;
  1.2006 +	}
  1.2007 +
  1.2008 +	CHECKPOINT ();
  1.2009 +
  1.2010 +	while (w >= 2)
  1.2011 +	{
  1.2012 +	    uint64_t m0, m1;
  1.2013 +
  1.2014 +	    m0 = *mask;
  1.2015 +	    m1 = *(mask + 1);
  1.2016 +
  1.2017 +	    if (srca == 0xff && (m0 & m1) == 0xff)
  1.2018 +	    {
  1.2019 +		*(uint64_t *)dst = srcsrc;
  1.2020 +	    }
  1.2021 +	    else if (m0 | m1)
  1.2022 +	    {
  1.2023 +		__m64 vdest;
  1.2024 +		__m64 dest0, dest1;
  1.2025 +
  1.2026 +		vdest = *(__m64 *)dst;
  1.2027 +
  1.2028 +		dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
  1.2029 +				 expand8888 (vdest, 0));
  1.2030 +		dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
  1.2031 +				 expand8888 (vdest, 1));
  1.2032 +
  1.2033 +		*(__m64 *)dst = pack8888 (dest0, dest1);
  1.2034 +	    }
  1.2035 +
  1.2036 +	    mask += 2;
  1.2037 +	    dst += 2;
  1.2038 +	    w -= 2;
  1.2039 +	}
  1.2040 +
  1.2041 +	CHECKPOINT ();
  1.2042 +
  1.2043 +	if (w)
  1.2044 +	{
  1.2045 +	    uint64_t m = *mask;
  1.2046 +
  1.2047 +	    if (m)
  1.2048 +	    {
  1.2049 +		__m64 vdest = load8888 (dst);
  1.2050 +
  1.2051 +		vdest = in_over (
  1.2052 +		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
  1.2053 +		store8888 (dst, vdest);
  1.2054 +	    }
  1.2055 +	}
  1.2056 +    }
  1.2057 +
  1.2058 +    _mm_empty ();
  1.2059 +}
  1.2060 +
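          +/* mmx_fill () replicates the caller's filler into a 64-bit
          + * pattern and then stores 64 bytes per iteration.  For bpp == 8
          + * the low byte is spread by multiplication, e.g.
          + * 0xAB * 0x01010101 == 0xABABABAB; for bpp == 16 the low halfword
          + * is spread by x * 0x00010001.  Each scanline is aligned with a
          + * 1-, 2- and 4-byte store cascade until d is 8-byte aligned, and
          + * the tail runs the same cascade in reverse.
          + */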
  1.2061 +static pixman_bool_t
  1.2062 +mmx_fill (pixman_implementation_t *imp,
  1.2063 +          uint32_t *               bits,
  1.2064 +          int                      stride,
  1.2065 +          int                      bpp,
  1.2066 +          int                      x,
  1.2067 +          int                      y,
  1.2068 +          int                      width,
  1.2069 +          int                      height,
  1.2070 +          uint32_t		   filler)
  1.2071 +{
  1.2072 +    uint64_t fill;
  1.2073 +    __m64 vfill;
  1.2074 +    uint32_t byte_width;
  1.2075 +    uint8_t     *byte_line;
  1.2076 +
  1.2077 +#if defined __GNUC__ && defined USE_X86_MMX
  1.2078 +    __m64 v1, v2, v3, v4, v5, v6, v7;
  1.2079 +#endif
  1.2080 +
  1.2081 +    if (bpp != 16 && bpp != 32 && bpp != 8)
  1.2082 +	return FALSE;
  1.2083 +
  1.2084 +    if (bpp == 8)
  1.2085 +    {
  1.2086 +	stride = stride * (int) sizeof (uint32_t) / 1;
  1.2087 +	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
  1.2088 +	byte_width = width;
  1.2089 +	stride *= 1;
  1.2090 +        filler = (filler & 0xff) * 0x01010101;
  1.2091 +    }
  1.2092 +    else if (bpp == 16)
  1.2093 +    {
  1.2094 +	stride = stride * (int) sizeof (uint32_t) / 2;
  1.2095 +	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
  1.2096 +	byte_width = 2 * width;
  1.2097 +	stride *= 2;
  1.2098 +        filler = (filler & 0xffff) * 0x00010001;
  1.2099 +    }
  1.2100 +    else
  1.2101 +    {
  1.2102 +	stride = stride * (int) sizeof (uint32_t) / 4;
  1.2103 +	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
  1.2104 +	byte_width = 4 * width;
  1.2105 +	stride *= 4;
  1.2106 +    }
  1.2107 +
  1.2108 +    fill = ((uint64_t)filler << 32) | filler;
  1.2109 +    vfill = to_m64 (fill);
  1.2110 +
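          +    /* Pin seven extra copies of the fill pattern in MMX registers
          +     * so that the 64-byte store loop below can issue eight movq
          +     * stores straight from registers instead of reloading the
          +     * pattern on every iteration.
          +     */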
  1.2111 +#if defined __GNUC__ && defined USE_X86_MMX
  1.2112 +    __asm__ (
  1.2113 +        "movq		%7,	%0\n"
  1.2114 +        "movq		%7,	%1\n"
  1.2115 +        "movq		%7,	%2\n"
  1.2116 +        "movq		%7,	%3\n"
  1.2117 +        "movq		%7,	%4\n"
  1.2118 +        "movq		%7,	%5\n"
  1.2119 +        "movq		%7,	%6\n"
  1.2120 +	: "=&y" (v1), "=&y" (v2), "=&y" (v3),
  1.2121 +	  "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
  1.2122 +	: "y" (vfill));
  1.2123 +#endif
  1.2124 +
  1.2125 +    while (height--)
  1.2126 +    {
  1.2127 +	int w;
  1.2128 +	uint8_t *d = byte_line;
  1.2129 +
  1.2130 +	byte_line += stride;
  1.2131 +	w = byte_width;
  1.2132 +
  1.2133 +	if (w >= 1 && ((uintptr_t)d & 1))
  1.2134 +	{
  1.2135 +	    *(uint8_t *)d = (filler & 0xff);
  1.2136 +	    w--;
  1.2137 +	    d++;
  1.2138 +	}
  1.2139 +
  1.2140 +	if (w >= 2 && ((uintptr_t)d & 3))
  1.2141 +	{
  1.2142 +	    *(uint16_t *)d = filler;
  1.2143 +	    w -= 2;
  1.2144 +	    d += 2;
  1.2145 +	}
  1.2146 +
  1.2147 +	while (w >= 4 && ((uintptr_t)d & 7))
  1.2148 +	{
  1.2149 +	    *(uint32_t *)d = filler;
  1.2150 +
  1.2151 +	    w -= 4;
  1.2152 +	    d += 4;
  1.2153 +	}
  1.2154 +
  1.2155 +	while (w >= 64)
  1.2156 +	{
  1.2157 +#if defined __GNUC__ && defined USE_X86_MMX
  1.2158 +	    __asm__ (
  1.2159 +	        "movq	%1,	  (%0)\n"
  1.2160 +	        "movq	%2,	 8(%0)\n"
  1.2161 +	        "movq	%3,	16(%0)\n"
  1.2162 +	        "movq	%4,	24(%0)\n"
  1.2163 +	        "movq	%5,	32(%0)\n"
  1.2164 +	        "movq	%6,	40(%0)\n"
  1.2165 +	        "movq	%7,	48(%0)\n"
  1.2166 +	        "movq	%8,	56(%0)\n"
  1.2167 +		:
  1.2168 +		: "r" (d),
  1.2169 +		  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
  1.2170 +		  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
  1.2171 +		: "memory");
  1.2172 +#else
  1.2173 +	    *(__m64*) (d +  0) = vfill;
  1.2174 +	    *(__m64*) (d +  8) = vfill;
  1.2175 +	    *(__m64*) (d + 16) = vfill;
  1.2176 +	    *(__m64*) (d + 24) = vfill;
  1.2177 +	    *(__m64*) (d + 32) = vfill;
  1.2178 +	    *(__m64*) (d + 40) = vfill;
  1.2179 +	    *(__m64*) (d + 48) = vfill;
  1.2180 +	    *(__m64*) (d + 56) = vfill;
  1.2181 +#endif
  1.2182 +	    w -= 64;
  1.2183 +	    d += 64;
  1.2184 +	}
  1.2185 +
  1.2186 +	while (w >= 4)
  1.2187 +	{
  1.2188 +	    *(uint32_t *)d = filler;
  1.2189 +
  1.2190 +	    w -= 4;
  1.2191 +	    d += 4;
  1.2192 +	}
  1.2193 +	if (w >= 2)
  1.2194 +	{
  1.2195 +	    *(uint16_t *)d = filler;
  1.2196 +	    w -= 2;
  1.2197 +	    d += 2;
  1.2198 +	}
  1.2199 +	if (w >= 1)
  1.2200 +	{
  1.2201 +	    *(uint8_t *)d = (filler & 0xff);
  1.2202 +	    w--;
  1.2203 +	    d++;
  1.2204 +	}
  1.2205 +
  1.2206 +    }
  1.2207 +
  1.2208 +    _mm_empty ();
  1.2209 +    return TRUE;
  1.2210 +}
  1.2211 +
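          +/* SRC conversion from x8r8g8b8 to r5g6b5: no blending, just a
          + * format repack.  The vector body converts four pixels at a time
          + * with pack_4xpacked565 ().
          + */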
  1.2212 +static void
  1.2213 +mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
  1.2214 +                             pixman_composite_info_t *info)
  1.2215 +{
  1.2216 +    PIXMAN_COMPOSITE_ARGS (info);
  1.2217 +    uint16_t    *dst_line, *dst;
  1.2218 +    uint32_t    *src_line, *src, s;
  1.2219 +    int dst_stride, src_stride;
  1.2220 +    int32_t w;
  1.2221 +
  1.2222 +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.2223 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  1.2224 +
  1.2225 +    while (height--)
  1.2226 +    {
  1.2227 +	dst = dst_line;
  1.2228 +	dst_line += dst_stride;
  1.2229 +	src = src_line;
  1.2230 +	src_line += src_stride;
  1.2231 +	w = width;
  1.2232 +
  1.2233 +	while (w && (uintptr_t)dst & 7)
  1.2234 +	{
  1.2235 +	    s = *src++;
  1.2236 +	    *dst = convert_8888_to_0565 (s);
  1.2237 +	    dst++;
  1.2238 +	    w--;
  1.2239 +	}
  1.2240 +
  1.2241 +	while (w >= 4)
  1.2242 +	{
  1.2243 +	    __m64 vdest;
  1.2244 +	    __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
  1.2245 +	    __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));
  1.2246 +
  1.2247 +	    vdest = pack_4xpacked565 (vsrc0, vsrc1);
  1.2248 +
  1.2249 +	    *(__m64 *)dst = vdest;
  1.2250 +
  1.2251 +	    w -= 4;
  1.2252 +	    src += 4;
  1.2253 +	    dst += 4;
  1.2254 +	}
  1.2255 +
  1.2256 +	while (w)
  1.2257 +	{
  1.2258 +	    s = *src++;
  1.2259 +	    *dst = convert_8888_to_0565 (s);
  1.2260 +	    dst++;
  1.2261 +	    w--;
  1.2262 +	}
  1.2263 +    }
  1.2264 +
  1.2265 +    _mm_empty ();
  1.2266 +}
  1.2267 +
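          +/* Solid SRC under an a8 mask.  Unlike the OVER paths, SRC ignores
          + * the existing destination, so pixels whose mask byte is zero are
          + * explicitly written as zero rather than left alone, and a fully
          + * transparent source degenerates into mmx_fill () with zero.
          + */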
  1.2268 +static void
  1.2269 +mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
  1.2270 +                            pixman_composite_info_t *info)
  1.2271 +{
  1.2272 +    PIXMAN_COMPOSITE_ARGS (info);
  1.2273 +    uint32_t src, srca;
  1.2274 +    uint32_t    *dst_line, *dst;
  1.2275 +    uint8_t     *mask_line, *mask;
  1.2276 +    int dst_stride, mask_stride;
  1.2277 +    int32_t w;
  1.2278 +    __m64 vsrc;
  1.2279 +    uint64_t srcsrc;
  1.2280 +
  1.2281 +    CHECKPOINT ();
  1.2282 +
  1.2283 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.2284 +
  1.2285 +    srca = src >> 24;
  1.2286 +    if (src == 0)
  1.2287 +    {
  1.2288 +	mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
  1.2289 +		  PIXMAN_FORMAT_BPP (dest_image->bits.format),
  1.2290 +		  dest_x, dest_y, width, height, 0);
  1.2291 +	return;
  1.2292 +    }
  1.2293 +
  1.2294 +    srcsrc = (uint64_t)src << 32 | src;
  1.2295 +
  1.2296 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.2297 +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  1.2298 +
  1.2299 +    vsrc = load8888 (&src);
  1.2300 +
  1.2301 +    while (height--)
  1.2302 +    {
  1.2303 +	dst = dst_line;
  1.2304 +	dst_line += dst_stride;
  1.2305 +	mask = mask_line;
  1.2306 +	mask_line += mask_stride;
  1.2307 +	w = width;
  1.2308 +
  1.2309 +	CHECKPOINT ();
  1.2310 +
  1.2311 +	while (w && (uintptr_t)dst & 7)
  1.2312 +	{
  1.2313 +	    uint64_t m = *mask;
  1.2314 +
  1.2315 +	    if (m)
  1.2316 +	    {
  1.2317 +		__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
  1.2318 +
  1.2319 +		store8888 (dst, vdest);
  1.2320 +	    }
  1.2321 +	    else
  1.2322 +	    {
  1.2323 +		*dst = 0;
  1.2324 +	    }
  1.2325 +
  1.2326 +	    w--;
  1.2327 +	    mask++;
  1.2328 +	    dst++;
  1.2329 +	}
  1.2330 +
  1.2331 +	CHECKPOINT ();
  1.2332 +
  1.2333 +	while (w >= 2)
  1.2334 +	{
  1.2335 +	    uint64_t m0, m1;
  1.2336 +	    m0 = *mask;
  1.2337 +	    m1 = *(mask + 1);
  1.2338 +
  1.2339 +	    if (srca == 0xff && (m0 & m1) == 0xff)
  1.2340 +	    {
  1.2341 +		*(uint64_t *)dst = srcsrc;
  1.2342 +	    }
  1.2343 +	    else if (m0 | m1)
  1.2344 +	    {
  1.2345 +		__m64 dest0, dest1;
  1.2346 +
  1.2347 +		dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
  1.2348 +		dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
  1.2349 +
  1.2350 +		*(__m64 *)dst = pack8888 (dest0, dest1);
  1.2351 +	    }
  1.2352 +	    else
  1.2353 +	    {
  1.2354 +		*(uint64_t *)dst = 0;
  1.2355 +	    }
  1.2356 +
  1.2357 +	    mask += 2;
  1.2358 +	    dst += 2;
  1.2359 +	    w -= 2;
  1.2360 +	}
  1.2361 +
  1.2362 +	CHECKPOINT ();
  1.2363 +
  1.2364 +	if (w)
  1.2365 +	{
  1.2366 +	    uint64_t m = *mask;
  1.2367 +
  1.2368 +	    if (m)
  1.2369 +	    {
   1.2370 +		/* SRC ignores the destination; no load is needed. */
   1.2371 +		__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
  1.2373 +		store8888 (dst, vdest);
  1.2374 +	    }
  1.2375 +	    else
  1.2376 +	    {
  1.2377 +		*dst = 0;
  1.2378 +	    }
  1.2379 +	}
  1.2380 +    }
  1.2381 +
  1.2382 +    _mm_empty ();
  1.2383 +}
  1.2384 +
  1.2385 +static void
  1.2386 +mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
  1.2387 +                             pixman_composite_info_t *info)
  1.2388 +{
  1.2389 +    PIXMAN_COMPOSITE_ARGS (info);
  1.2390 +    uint32_t src, srca;
  1.2391 +    uint16_t *dst_line, *dst;
  1.2392 +    uint8_t *mask_line, *mask;
  1.2393 +    int dst_stride, mask_stride;
  1.2394 +    int32_t w;
  1.2395 +    __m64 vsrc, vsrca, tmp;
  1.2396 +    __m64 srcsrcsrcsrc;
  1.2397 +
  1.2398 +    CHECKPOINT ();
  1.2399 +
  1.2400 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.2401 +
  1.2402 +    srca = src >> 24;
  1.2403 +    if (src == 0)
  1.2404 +	return;
  1.2405 +
  1.2406 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  1.2407 +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  1.2408 +
  1.2409 +    vsrc = load8888 (&src);
  1.2410 +    vsrca = expand_alpha (vsrc);
  1.2411 +
  1.2412 +    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
  1.2413 +    srcsrcsrcsrc = expand_alpha_rev (tmp);
  1.2414 +
  1.2415 +    while (height--)
  1.2416 +    {
  1.2417 +	dst = dst_line;
  1.2418 +	dst_line += dst_stride;
  1.2419 +	mask = mask_line;
  1.2420 +	mask_line += mask_stride;
  1.2421 +	w = width;
  1.2422 +
  1.2423 +	CHECKPOINT ();
  1.2424 +
  1.2425 +	while (w && (uintptr_t)dst & 7)
  1.2426 +	{
  1.2427 +	    uint64_t m = *mask;
  1.2428 +
  1.2429 +	    if (m)
  1.2430 +	    {
  1.2431 +		uint64_t d = *dst;
  1.2432 +		__m64 vd = to_m64 (d);
  1.2433 +		__m64 vdest = in_over (
  1.2434 +		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
  1.2435 +
  1.2436 +		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
  1.2437 +		*dst = to_uint64 (vd);
  1.2438 +	    }
  1.2439 +
  1.2440 +	    w--;
  1.2441 +	    mask++;
  1.2442 +	    dst++;
  1.2443 +	}
  1.2444 +
  1.2445 +	CHECKPOINT ();
  1.2446 +
  1.2447 +	while (w >= 4)
  1.2448 +	{
  1.2449 +	    uint64_t m0, m1, m2, m3;
  1.2450 +	    m0 = *mask;
  1.2451 +	    m1 = *(mask + 1);
  1.2452 +	    m2 = *(mask + 2);
  1.2453 +	    m3 = *(mask + 3);
  1.2454 +
  1.2455 +	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
  1.2456 +	    {
  1.2457 +		*(__m64 *)dst = srcsrcsrcsrc;
  1.2458 +	    }
  1.2459 +	    else if (m0 | m1 | m2 | m3)
  1.2460 +	    {
  1.2461 +		__m64 vdest = *(__m64 *)dst;
  1.2462 +		__m64 v0, v1, v2, v3;
  1.2463 +		__m64 vm0, vm1, vm2, vm3;
  1.2464 +
  1.2465 +		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
  1.2466 +
  1.2467 +		vm0 = to_m64 (m0);
  1.2468 +		v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);
  1.2469 +
  1.2470 +		vm1 = to_m64 (m1);
  1.2471 +		v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);
  1.2472 +
  1.2473 +		vm2 = to_m64 (m2);
  1.2474 +		v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);
  1.2475 +
  1.2476 +		vm3 = to_m64 (m3);
  1.2477 +		v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);
  1.2478 +
   1.2479 +		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
  1.2480 +	    }
  1.2481 +
  1.2482 +	    w -= 4;
  1.2483 +	    mask += 4;
  1.2484 +	    dst += 4;
  1.2485 +	}
  1.2486 +
  1.2487 +	CHECKPOINT ();
  1.2488 +
  1.2489 +	while (w)
  1.2490 +	{
  1.2491 +	    uint64_t m = *mask;
  1.2492 +
  1.2493 +	    if (m)
  1.2494 +	    {
  1.2495 +		uint64_t d = *dst;
  1.2496 +		__m64 vd = to_m64 (d);
  1.2497 +		__m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
  1.2498 +				       expand565 (vd, 0));
  1.2499 +		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
  1.2500 +		*dst = to_uint64 (vd);
  1.2501 +	    }
  1.2502 +
  1.2503 +	    w--;
  1.2504 +	    mask++;
  1.2505 +	    dst++;
  1.2506 +	}
  1.2507 +    }
  1.2508 +
  1.2509 +    _mm_empty ();
  1.2510 +}
  1.2511 +
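          +/* "pixbuf" sources are non-premultiplied and stored with the
          + * opposite channel order, so over_rev_non_pre () runs
          + * invert_colors () and premultiplies by the source alpha before
          + * the usual over.  Runs of fully opaque pixels skip the blend and
          + * only need the color swizzle.
          + */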
  1.2512 +static void
  1.2513 +mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
  1.2514 +                                pixman_composite_info_t *info)
  1.2515 +{
  1.2516 +    PIXMAN_COMPOSITE_ARGS (info);
  1.2517 +    uint16_t    *dst_line, *dst;
  1.2518 +    uint32_t    *src_line, *src;
  1.2519 +    int dst_stride, src_stride;
  1.2520 +    int32_t w;
  1.2521 +
  1.2522 +    CHECKPOINT ();
  1.2523 +
  1.2524 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  1.2525 +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.2526 +
  1.2527 +#if 0
  1.2528 +    /* FIXME */
  1.2529 +    assert (src_image->drawable == mask_image->drawable);
  1.2530 +#endif
  1.2531 +
  1.2532 +    while (height--)
  1.2533 +    {
  1.2534 +	dst = dst_line;
  1.2535 +	dst_line += dst_stride;
  1.2536 +	src = src_line;
  1.2537 +	src_line += src_stride;
  1.2538 +	w = width;
  1.2539 +
  1.2540 +	CHECKPOINT ();
  1.2541 +
  1.2542 +	while (w && (uintptr_t)dst & 7)
  1.2543 +	{
  1.2544 +	    __m64 vsrc = load8888 (src);
  1.2545 +	    uint64_t d = *dst;
  1.2546 +	    __m64 vdest = expand565 (to_m64 (d), 0);
  1.2547 +
  1.2548 +	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
  1.2549 +
  1.2550 +	    *dst = to_uint64 (vdest);
  1.2551 +
  1.2552 +	    w--;
  1.2553 +	    dst++;
  1.2554 +	    src++;
  1.2555 +	}
  1.2556 +
  1.2557 +	CHECKPOINT ();
  1.2558 +
  1.2559 +	while (w >= 4)
  1.2560 +	{
  1.2561 +	    uint32_t s0, s1, s2, s3;
  1.2562 +	    unsigned char a0, a1, a2, a3;
  1.2563 +
  1.2564 +	    s0 = *src;
  1.2565 +	    s1 = *(src + 1);
  1.2566 +	    s2 = *(src + 2);
  1.2567 +	    s3 = *(src + 3);
  1.2568 +
  1.2569 +	    a0 = (s0 >> 24);
  1.2570 +	    a1 = (s1 >> 24);
  1.2571 +	    a2 = (s2 >> 24);
  1.2572 +	    a3 = (s3 >> 24);
  1.2573 +
  1.2574 +	    if ((a0 & a1 & a2 & a3) == 0xFF)
  1.2575 +	    {
  1.2576 +		__m64 v0 = invert_colors (load8888 (&s0));
  1.2577 +		__m64 v1 = invert_colors (load8888 (&s1));
  1.2578 +		__m64 v2 = invert_colors (load8888 (&s2));
  1.2579 +		__m64 v3 = invert_colors (load8888 (&s3));
  1.2580 +
  1.2581 +		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
  1.2582 +	    }
  1.2583 +	    else if (s0 | s1 | s2 | s3)
  1.2584 +	    {
  1.2585 +		__m64 vdest = *(__m64 *)dst;
  1.2586 +		__m64 v0, v1, v2, v3;
  1.2587 +
  1.2588 +		__m64 vsrc0 = load8888 (&s0);
  1.2589 +		__m64 vsrc1 = load8888 (&s1);
  1.2590 +		__m64 vsrc2 = load8888 (&s2);
  1.2591 +		__m64 vsrc3 = load8888 (&s3);
  1.2592 +
  1.2593 +		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
  1.2594 +
  1.2595 +		v0 = over_rev_non_pre (vsrc0, v0);
  1.2596 +		v1 = over_rev_non_pre (vsrc1, v1);
  1.2597 +		v2 = over_rev_non_pre (vsrc2, v2);
  1.2598 +		v3 = over_rev_non_pre (vsrc3, v3);
  1.2599 +
  1.2600 +		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
  1.2601 +	    }
  1.2602 +
  1.2603 +	    w -= 4;
  1.2604 +	    dst += 4;
  1.2605 +	    src += 4;
  1.2606 +	}
  1.2607 +
  1.2608 +	CHECKPOINT ();
  1.2609 +
  1.2610 +	while (w)
  1.2611 +	{
  1.2612 +	    __m64 vsrc = load8888 (src);
  1.2613 +	    uint64_t d = *dst;
  1.2614 +	    __m64 vdest = expand565 (to_m64 (d), 0);
  1.2615 +
  1.2616 +	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);
  1.2617 +
  1.2618 +	    *dst = to_uint64 (vdest);
  1.2619 +
  1.2620 +	    w--;
  1.2621 +	    dst++;
  1.2622 +	    src++;
  1.2623 +	}
  1.2624 +    }
  1.2625 +
  1.2626 +    _mm_empty ();
  1.2627 +}
  1.2628 +
  1.2629 +static void
  1.2630 +mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
  1.2631 +                                pixman_composite_info_t *info)
  1.2632 +{
  1.2633 +    PIXMAN_COMPOSITE_ARGS (info);
  1.2634 +    uint32_t    *dst_line, *dst;
  1.2635 +    uint32_t    *src_line, *src;
  1.2636 +    int dst_stride, src_stride;
  1.2637 +    int32_t w;
  1.2638 +
  1.2639 +    CHECKPOINT ();
  1.2640 +
  1.2641 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.2642 +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.2643 +
  1.2644 +#if 0
  1.2645 +    /* FIXME */
  1.2646 +    assert (src_image->drawable == mask_image->drawable);
  1.2647 +#endif
  1.2648 +
  1.2649 +    while (height--)
  1.2650 +    {
  1.2651 +	dst = dst_line;
  1.2652 +	dst_line += dst_stride;
  1.2653 +	src = src_line;
  1.2654 +	src_line += src_stride;
  1.2655 +	w = width;
  1.2656 +
  1.2657 +	while (w && (uintptr_t)dst & 7)
  1.2658 +	{
  1.2659 +	    __m64 s = load8888 (src);
  1.2660 +	    __m64 d = load8888 (dst);
  1.2661 +
  1.2662 +	    store8888 (dst, over_rev_non_pre (s, d));
  1.2663 +
  1.2664 +	    w--;
  1.2665 +	    dst++;
  1.2666 +	    src++;
  1.2667 +	}
  1.2668 +
  1.2669 +	while (w >= 2)
  1.2670 +	{
  1.2671 +	    uint32_t s0, s1;
  1.2672 +	    unsigned char a0, a1;
  1.2673 +	    __m64 d0, d1;
  1.2674 +
  1.2675 +	    s0 = *src;
  1.2676 +	    s1 = *(src + 1);
  1.2677 +
  1.2678 +	    a0 = (s0 >> 24);
  1.2679 +	    a1 = (s1 >> 24);
  1.2680 +
  1.2681 +	    if ((a0 & a1) == 0xFF)
  1.2682 +	    {
  1.2683 +		d0 = invert_colors (load8888 (&s0));
  1.2684 +		d1 = invert_colors (load8888 (&s1));
  1.2685 +
  1.2686 +		*(__m64 *)dst = pack8888 (d0, d1);
  1.2687 +	    }
  1.2688 +	    else if (s0 | s1)
  1.2689 +	    {
  1.2690 +		__m64 vdest = *(__m64 *)dst;
  1.2691 +
  1.2692 +		d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
  1.2693 +		d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));
  1.2694 +
  1.2695 +		*(__m64 *)dst = pack8888 (d0, d1);
  1.2696 +	    }
  1.2697 +
  1.2698 +	    w -= 2;
  1.2699 +	    dst += 2;
  1.2700 +	    src += 2;
  1.2701 +	}
  1.2702 +
  1.2703 +	if (w)
  1.2704 +	{
  1.2705 +	    __m64 s = load8888 (src);
  1.2706 +	    __m64 d = load8888 (dst);
  1.2707 +
  1.2708 +	    store8888 (dst, over_rev_non_pre (s, d));
  1.2709 +	}
  1.2710 +    }
  1.2711 +
  1.2712 +    _mm_empty ();
  1.2713 +}
  1.2714 +
  1.2715 +static void
  1.2716 +mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
  1.2717 +                                   pixman_composite_info_t *info)
  1.2718 +{
  1.2719 +    PIXMAN_COMPOSITE_ARGS (info);
  1.2720 +    uint32_t src;
  1.2721 +    uint16_t    *dst_line;
  1.2722 +    uint32_t    *mask_line;
  1.2723 +    int dst_stride, mask_stride;
  1.2724 +    __m64 vsrc, vsrca;
  1.2725 +
  1.2726 +    CHECKPOINT ();
  1.2727 +
  1.2728 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.2729 +
  1.2730 +    if (src == 0)
  1.2731 +	return;
  1.2732 +
  1.2733 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  1.2734 +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
  1.2735 +
  1.2736 +    vsrc = load8888 (&src);
  1.2737 +    vsrca = expand_alpha (vsrc);
  1.2738 +
  1.2739 +    while (height--)
  1.2740 +    {
  1.2741 +	int twidth = width;
  1.2742 +	uint32_t *p = (uint32_t *)mask_line;
  1.2743 +	uint16_t *q = (uint16_t *)dst_line;
  1.2744 +
  1.2745 +	while (twidth && ((uintptr_t)q & 7))
  1.2746 +	{
  1.2747 +	    uint32_t m = *(uint32_t *)p;
  1.2748 +
  1.2749 +	    if (m)
  1.2750 +	    {
  1.2751 +		uint64_t d = *q;
  1.2752 +		__m64 vdest = expand565 (to_m64 (d), 0);
  1.2753 +		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
  1.2754 +		*q = to_uint64 (vdest);
  1.2755 +	    }
  1.2756 +
  1.2757 +	    twidth--;
  1.2758 +	    p++;
  1.2759 +	    q++;
  1.2760 +	}
  1.2761 +
  1.2762 +	while (twidth >= 4)
  1.2763 +	{
  1.2764 +	    uint32_t m0, m1, m2, m3;
  1.2765 +
  1.2766 +	    m0 = *p;
  1.2767 +	    m1 = *(p + 1);
  1.2768 +	    m2 = *(p + 2);
  1.2769 +	    m3 = *(p + 3);
  1.2770 +
  1.2771 +	    if ((m0 | m1 | m2 | m3))
  1.2772 +	    {
  1.2773 +		__m64 vdest = *(__m64 *)q;
  1.2774 +		__m64 v0, v1, v2, v3;
  1.2775 +
  1.2776 +		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);
  1.2777 +
  1.2778 +		v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
  1.2779 +		v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
  1.2780 +		v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
  1.2781 +		v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);
  1.2782 +
  1.2783 +		*(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
  1.2784 +	    }
  1.2785 +	    twidth -= 4;
  1.2786 +	    p += 4;
  1.2787 +	    q += 4;
  1.2788 +	}
  1.2789 +
  1.2790 +	while (twidth)
  1.2791 +	{
  1.2792 +	    uint32_t m;
  1.2793 +
  1.2794 +	    m = *(uint32_t *)p;
  1.2795 +	    if (m)
  1.2796 +	    {
  1.2797 +		uint64_t d = *q;
  1.2798 +		__m64 vdest = expand565 (to_m64 (d), 0);
  1.2799 +		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
  1.2800 +		*q = to_uint64 (vdest);
  1.2801 +	    }
  1.2802 +
  1.2803 +	    twidth--;
  1.2804 +	    p++;
  1.2805 +	    q++;
  1.2806 +	}
  1.2807 +
  1.2808 +	mask_line += mask_stride;
  1.2809 +	dst_line += dst_stride;
  1.2810 +    }
  1.2811 +
  1.2812 +    _mm_empty ();
  1.2813 +}
  1.2814 +
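          +/* The IN operator is a plain per-channel multiply.  The scalar
          + * edges use MUL_UN8 () from pixman-combine32.h, the usual
          + * rounding approximation of a * b / 255:
          + * t = a * b + 0x80; result = (t + (t >> 8)) >> 8.  For example,
          + * a = 0xff, b = 0x80 yields exactly 0x80.  The vector body treats
          + * four a8 pixels as the four channels of one 8888 pixel and
          + * multiplies them all at once with in ().
          + */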
  1.2815 +static void
  1.2816 +mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
  1.2817 +                        pixman_composite_info_t *info)
  1.2818 +{
  1.2819 +    PIXMAN_COMPOSITE_ARGS (info);
  1.2820 +    uint8_t *dst_line, *dst;
  1.2821 +    uint8_t *mask_line, *mask;
  1.2822 +    int dst_stride, mask_stride;
  1.2823 +    int32_t w;
  1.2824 +    uint32_t src;
  1.2825 +    uint8_t sa;
  1.2826 +    __m64 vsrc, vsrca;
  1.2827 +
  1.2828 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  1.2829 +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  1.2830 +
  1.2831 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.2832 +
  1.2833 +    sa = src >> 24;
  1.2834 +
  1.2835 +    vsrc = load8888 (&src);
  1.2836 +    vsrca = expand_alpha (vsrc);
  1.2837 +
  1.2838 +    while (height--)
  1.2839 +    {
  1.2840 +	dst = dst_line;
  1.2841 +	dst_line += dst_stride;
  1.2842 +	mask = mask_line;
  1.2843 +	mask_line += mask_stride;
  1.2844 +	w = width;
  1.2845 +
  1.2846 +	while (w && (uintptr_t)dst & 7)
  1.2847 +	{
  1.2848 +	    uint16_t tmp;
  1.2849 +	    uint8_t a;
  1.2850 +	    uint32_t m, d;
  1.2851 +
  1.2852 +	    a = *mask++;
  1.2853 +	    d = *dst;
  1.2854 +
  1.2855 +	    m = MUL_UN8 (sa, a, tmp);
  1.2856 +	    d = MUL_UN8 (m, d, tmp);
  1.2857 +
  1.2858 +	    *dst++ = d;
  1.2859 +	    w--;
  1.2860 +	}
  1.2861 +
  1.2862 +	while (w >= 4)
  1.2863 +	{
  1.2864 +	    __m64 vmask;
  1.2865 +	    __m64 vdest;
  1.2866 +
  1.2867 +	    vmask = load8888u ((uint32_t *)mask);
  1.2868 +	    vdest = load8888 ((uint32_t *)dst);
  1.2869 +
  1.2870 +	    store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));
  1.2871 +
  1.2872 +	    dst += 4;
  1.2873 +	    mask += 4;
  1.2874 +	    w -= 4;
  1.2875 +	}
  1.2876 +
  1.2877 +	while (w--)
  1.2878 +	{
  1.2879 +	    uint16_t tmp;
  1.2880 +	    uint8_t a;
  1.2881 +	    uint32_t m, d;
  1.2882 +
  1.2883 +	    a = *mask++;
  1.2884 +	    d = *dst;
  1.2885 +
  1.2886 +	    m = MUL_UN8 (sa, a, tmp);
  1.2887 +	    d = MUL_UN8 (m, d, tmp);
  1.2888 +
  1.2889 +	    *dst++ = d;
  1.2890 +	}
  1.2891 +    }
  1.2892 +
  1.2893 +    _mm_empty ();
  1.2894 +}
  1.2895 +
  1.2896 +static void
  1.2897 +mmx_composite_in_8_8 (pixman_implementation_t *imp,
  1.2898 +                      pixman_composite_info_t *info)
  1.2899 +{
  1.2900 +    PIXMAN_COMPOSITE_ARGS (info);
  1.2901 +    uint8_t     *dst_line, *dst;
  1.2902 +    uint8_t     *src_line, *src;
  1.2903 +    int src_stride, dst_stride;
  1.2904 +    int32_t w;
  1.2905 +
  1.2906 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  1.2907 +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
  1.2908 +
  1.2909 +    while (height--)
  1.2910 +    {
  1.2911 +	dst = dst_line;
  1.2912 +	dst_line += dst_stride;
  1.2913 +	src = src_line;
  1.2914 +	src_line += src_stride;
  1.2915 +	w = width;
  1.2916 +
  1.2917 +	while (w && (uintptr_t)dst & 3)
  1.2918 +	{
  1.2919 +	    uint8_t s, d;
  1.2920 +	    uint16_t tmp;
  1.2921 +
  1.2922 +	    s = *src;
  1.2923 +	    d = *dst;
  1.2924 +
  1.2925 +	    *dst = MUL_UN8 (s, d, tmp);
  1.2926 +
  1.2927 +	    src++;
  1.2928 +	    dst++;
  1.2929 +	    w--;
  1.2930 +	}
  1.2931 +
  1.2932 +	while (w >= 4)
  1.2933 +	{
  1.2934 +	    uint32_t *s = (uint32_t *)src;
  1.2935 +	    uint32_t *d = (uint32_t *)dst;
  1.2936 +
  1.2937 +	    store8888 (d, in (load8888u (s), load8888 (d)));
  1.2938 +
  1.2939 +	    w -= 4;
  1.2940 +	    dst += 4;
  1.2941 +	    src += 4;
  1.2942 +	}
  1.2943 +
  1.2944 +	while (w--)
  1.2945 +	{
  1.2946 +	    uint8_t s, d;
  1.2947 +	    uint16_t tmp;
  1.2948 +
  1.2949 +	    s = *src;
  1.2950 +	    d = *dst;
  1.2951 +
  1.2952 +	    *dst = MUL_UN8 (s, d, tmp);
  1.2953 +
  1.2954 +	    src++;
  1.2955 +	    dst++;
  1.2956 +	}
  1.2957 +    }
  1.2958 +
  1.2959 +    _mm_empty ();
  1.2960 +}
  1.2961 +
  1.2962 +static void
  1.2963 +mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
  1.2964 +			 pixman_composite_info_t *info)
  1.2965 +{
  1.2966 +    PIXMAN_COMPOSITE_ARGS (info);
  1.2967 +    uint8_t     *dst_line, *dst;
  1.2968 +    uint8_t     *mask_line, *mask;
  1.2969 +    int dst_stride, mask_stride;
  1.2970 +    int32_t w;
  1.2971 +    uint32_t src;
  1.2972 +    uint8_t sa;
  1.2973 +    __m64 vsrc, vsrca;
  1.2974 +
  1.2975 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  1.2976 +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  1.2977 +
  1.2978 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.2979 +
  1.2980 +    sa = src >> 24;
  1.2981 +
  1.2982 +    if (src == 0)
  1.2983 +	return;
  1.2984 +
  1.2985 +    vsrc = load8888 (&src);
  1.2986 +    vsrca = expand_alpha (vsrc);
  1.2987 +
  1.2988 +    while (height--)
  1.2989 +    {
  1.2990 +	dst = dst_line;
  1.2991 +	dst_line += dst_stride;
  1.2992 +	mask = mask_line;
  1.2993 +	mask_line += mask_stride;
  1.2994 +	w = width;
  1.2995 +
  1.2996 +	while (w && (uintptr_t)dst & 3)
  1.2997 +	{
  1.2998 +	    uint16_t tmp;
  1.2999 +	    uint16_t a;
  1.3000 +	    uint32_t m, d;
  1.3001 +	    uint32_t r;
  1.3002 +
  1.3003 +	    a = *mask++;
  1.3004 +	    d = *dst;
  1.3005 +
  1.3006 +	    m = MUL_UN8 (sa, a, tmp);
  1.3007 +	    r = ADD_UN8 (m, d, tmp);
  1.3008 +
  1.3009 +	    *dst++ = r;
  1.3010 +	    w--;
  1.3011 +	}
  1.3012 +
  1.3013 +	while (w >= 4)
  1.3014 +	{
  1.3015 +	    __m64 vmask;
  1.3016 +	    __m64 vdest;
  1.3017 +
  1.3018 +	    vmask = load8888u ((uint32_t *)mask);
  1.3019 +	    vdest = load8888 ((uint32_t *)dst);
  1.3020 +
  1.3021 +	    store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));
  1.3022 +
  1.3023 +	    dst += 4;
  1.3024 +	    mask += 4;
  1.3025 +	    w -= 4;
  1.3026 +	}
  1.3027 +
  1.3028 +	while (w--)
  1.3029 +	{
  1.3030 +	    uint16_t tmp;
  1.3031 +	    uint16_t a;
  1.3032 +	    uint32_t m, d;
  1.3033 +	    uint32_t r;
  1.3034 +
  1.3035 +	    a = *mask++;
  1.3036 +	    d = *dst;
  1.3037 +
  1.3038 +	    m = MUL_UN8 (sa, a, tmp);
  1.3039 +	    r = ADD_UN8 (m, d, tmp);
  1.3040 +
  1.3041 +	    *dst++ = r;
  1.3042 +	}
  1.3043 +    }
  1.3044 +
  1.3045 +    _mm_empty ();
  1.3046 +}
  1.3047 +
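          +/* The scalar edges of the add paths use a branch-free saturating
          + * byte add.  With t = d + s held in 16 bits, t >> 8 is 1 exactly
          + * when the sum overflowed, so (0 - (t >> 8)) is 0x0000 or 0xffff
          + * and OR-ing it in clamps the result.  Example: d = 0xf0,
          + * s = 0x20 gives t = 0x110, t | 0xffff = 0xffff, stored back as
          + * 0xff.  The vector body gets the same saturation from
          + * _mm_adds_pu8 ().
          + */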
  1.3048 +static void
  1.3049 +mmx_composite_add_8_8 (pixman_implementation_t *imp,
  1.3050 +		       pixman_composite_info_t *info)
  1.3051 +{
  1.3052 +    PIXMAN_COMPOSITE_ARGS (info);
  1.3053 +    uint8_t *dst_line, *dst;
  1.3054 +    uint8_t *src_line, *src;
  1.3055 +    int dst_stride, src_stride;
  1.3056 +    int32_t w;
  1.3057 +    uint8_t s, d;
  1.3058 +    uint16_t t;
  1.3059 +
  1.3060 +    CHECKPOINT ();
  1.3061 +
  1.3062 +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
  1.3063 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
  1.3064 +
  1.3065 +    while (height--)
  1.3066 +    {
  1.3067 +	dst = dst_line;
  1.3068 +	dst_line += dst_stride;
  1.3069 +	src = src_line;
  1.3070 +	src_line += src_stride;
  1.3071 +	w = width;
  1.3072 +
  1.3073 +	while (w && (uintptr_t)dst & 7)
  1.3074 +	{
  1.3075 +	    s = *src;
  1.3076 +	    d = *dst;
  1.3077 +	    t = d + s;
  1.3078 +	    s = t | (0 - (t >> 8));
  1.3079 +	    *dst = s;
  1.3080 +
  1.3081 +	    dst++;
  1.3082 +	    src++;
  1.3083 +	    w--;
  1.3084 +	}
  1.3085 +
  1.3086 +	while (w >= 8)
  1.3087 +	{
  1.3088 +	    *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
  1.3089 +	    dst += 8;
  1.3090 +	    src += 8;
  1.3091 +	    w -= 8;
  1.3092 +	}
  1.3093 +
  1.3094 +	while (w)
  1.3095 +	{
  1.3096 +	    s = *src;
  1.3097 +	    d = *dst;
  1.3098 +	    t = d + s;
  1.3099 +	    s = t | (0 - (t >> 8));
  1.3100 +	    *dst = s;
  1.3101 +
  1.3102 +	    dst++;
  1.3103 +	    src++;
  1.3104 +	    w--;
  1.3105 +	}
  1.3106 +    }
  1.3107 +
  1.3108 +    _mm_empty ();
  1.3109 +}
  1.3110 +
  1.3111 +static void
  1.3112 +mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
  1.3113 +                             pixman_composite_info_t *info)
  1.3114 +{
  1.3115 +    PIXMAN_COMPOSITE_ARGS (info);
  1.3116 +    uint16_t    *dst_line, *dst;
  1.3117 +    uint32_t	d;
  1.3118 +    uint16_t    *src_line, *src;
  1.3119 +    uint32_t	s;
  1.3120 +    int dst_stride, src_stride;
  1.3121 +    int32_t w;
  1.3122 +
  1.3123 +    CHECKPOINT ();
  1.3124 +
  1.3125 +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
  1.3126 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
  1.3127 +
  1.3128 +    while (height--)
  1.3129 +    {
  1.3130 +	dst = dst_line;
  1.3131 +	dst_line += dst_stride;
  1.3132 +	src = src_line;
  1.3133 +	src_line += src_stride;
  1.3134 +	w = width;
  1.3135 +
  1.3136 +	while (w && (uintptr_t)dst & 7)
  1.3137 +	{
  1.3138 +	    s = *src++;
  1.3139 +	    if (s)
  1.3140 +	    {
  1.3141 +		d = *dst;
  1.3142 +		s = convert_0565_to_8888 (s);
  1.3143 +		if (d)
  1.3144 +		{
  1.3145 +		    d = convert_0565_to_8888 (d);
  1.3146 +		    UN8x4_ADD_UN8x4 (s, d);
  1.3147 +		}
  1.3148 +		*dst = convert_8888_to_0565 (s);
  1.3149 +	    }
  1.3150 +	    dst++;
  1.3151 +	    w--;
  1.3152 +	}
  1.3153 +
  1.3154 +	while (w >= 4)
  1.3155 +	{
  1.3156 +	    __m64 vdest = *(__m64 *)dst;
  1.3157 +	    __m64 vsrc = ldq_u ((__m64 *)src);
  1.3158 +	    __m64 vd0, vd1;
  1.3159 +	    __m64 vs0, vs1;
  1.3160 +
  1.3161 +	    expand_4xpacked565 (vdest, &vd0, &vd1, 0);
  1.3162 +	    expand_4xpacked565 (vsrc, &vs0, &vs1, 0);
  1.3163 +
  1.3164 +	    vd0 = _mm_adds_pu8 (vd0, vs0);
  1.3165 +	    vd1 = _mm_adds_pu8 (vd1, vs1);
  1.3166 +
  1.3167 +	    *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);
  1.3168 +
  1.3169 +	    dst += 4;
  1.3170 +	    src += 4;
  1.3171 +	    w -= 4;
  1.3172 +	}
  1.3173 +
  1.3174 +	while (w--)
  1.3175 +	{
  1.3176 +	    s = *src++;
  1.3177 +	    if (s)
  1.3178 +	    {
  1.3179 +		d = *dst;
  1.3180 +		s = convert_0565_to_8888 (s);
  1.3181 +		if (d)
  1.3182 +		{
  1.3183 +		    d = convert_0565_to_8888 (d);
  1.3184 +		    UN8x4_ADD_UN8x4 (s, d);
  1.3185 +		}
  1.3186 +		*dst = convert_8888_to_0565 (s);
  1.3187 +	    }
  1.3188 +	    dst++;
  1.3189 +	}
  1.3190 +    }
  1.3191 +
  1.3192 +    _mm_empty ();
  1.3193 +}
  1.3194 +
  1.3195 +static void
  1.3196 +mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
  1.3197 +                             pixman_composite_info_t *info)
  1.3198 +{
  1.3199 +    PIXMAN_COMPOSITE_ARGS (info);
  1.3200 +    uint32_t    *dst_line, *dst;
  1.3201 +    uint32_t    *src_line, *src;
  1.3202 +    int dst_stride, src_stride;
  1.3203 +    int32_t w;
  1.3204 +
  1.3205 +    CHECKPOINT ();
  1.3206 +
  1.3207 +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.3208 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.3209 +
  1.3210 +    while (height--)
  1.3211 +    {
  1.3212 +	dst = dst_line;
  1.3213 +	dst_line += dst_stride;
  1.3214 +	src = src_line;
  1.3215 +	src_line += src_stride;
  1.3216 +	w = width;
  1.3217 +
  1.3218 +	while (w && (uintptr_t)dst & 7)
  1.3219 +	{
  1.3220 +	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
  1.3221 +	                              load ((const uint32_t *)dst)));
  1.3222 +	    dst++;
  1.3223 +	    src++;
  1.3224 +	    w--;
  1.3225 +	}
  1.3226 +
  1.3227 +	while (w >= 2)
  1.3228 +	{
  1.3229 +	    *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
  1.3230 +	    dst += 2;
  1.3231 +	    src += 2;
  1.3232 +	    w -= 2;
  1.3233 +	}
  1.3234 +
  1.3235 +	if (w)
  1.3236 +	{
  1.3237 +	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
  1.3238 +	                              load ((const uint32_t *)dst)));
   1.3240 +	}
  1.3241 +    }
  1.3242 +
  1.3243 +    _mm_empty ();
  1.3244 +}
  1.3245 +
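          +/* Raw rectangle copy for 16 and 32 bpp images.  The destination
          + * is aligned with the usual 1/2/4-byte cascade, then 64 bytes per
          + * iteration are moved through mm0-mm7 (via inline assembly on GCC
          + * and Sun Studio) or through ldq_u () loads in the portable
          + * fallback; only the destination needs alignment, source reads
          + * are unaligned-safe.
          + */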
  1.3246 +static pixman_bool_t
  1.3247 +mmx_blt (pixman_implementation_t *imp,
  1.3248 +         uint32_t *               src_bits,
  1.3249 +         uint32_t *               dst_bits,
  1.3250 +         int                      src_stride,
  1.3251 +         int                      dst_stride,
  1.3252 +         int                      src_bpp,
  1.3253 +         int                      dst_bpp,
  1.3254 +         int                      src_x,
  1.3255 +         int                      src_y,
  1.3256 +         int                      dest_x,
  1.3257 +         int                      dest_y,
  1.3258 +         int                      width,
  1.3259 +         int                      height)
  1.3260 +{
  1.3261 +    uint8_t *   src_bytes;
  1.3262 +    uint8_t *   dst_bytes;
  1.3263 +    int byte_width;
  1.3264 +
  1.3265 +    if (src_bpp != dst_bpp)
  1.3266 +	return FALSE;
  1.3267 +
  1.3268 +    if (src_bpp == 16)
  1.3269 +    {
  1.3270 +	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
  1.3271 +	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
  1.3272 +	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
  1.3273 +	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
  1.3274 +	byte_width = 2 * width;
  1.3275 +	src_stride *= 2;
  1.3276 +	dst_stride *= 2;
  1.3277 +    }
  1.3278 +    else if (src_bpp == 32)
  1.3279 +    {
  1.3280 +	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
  1.3281 +	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
  1.3282 +	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
  1.3283 +	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
  1.3284 +	byte_width = 4 * width;
  1.3285 +	src_stride *= 4;
  1.3286 +	dst_stride *= 4;
  1.3287 +    }
  1.3288 +    else
  1.3289 +    {
  1.3290 +	return FALSE;
  1.3291 +    }
  1.3292 +
  1.3293 +    while (height--)
  1.3294 +    {
  1.3295 +	int w;
  1.3296 +	uint8_t *s = src_bytes;
  1.3297 +	uint8_t *d = dst_bytes;
  1.3298 +	src_bytes += src_stride;
  1.3299 +	dst_bytes += dst_stride;
  1.3300 +	w = byte_width;
  1.3301 +
  1.3302 +	if (w >= 1 && ((uintptr_t)d & 1))
  1.3303 +	{
  1.3304 +	    *(uint8_t *)d = *(uint8_t *)s;
  1.3305 +	    w -= 1;
  1.3306 +	    s += 1;
  1.3307 +	    d += 1;
  1.3308 +	}
  1.3309 +
  1.3310 +	if (w >= 2 && ((uintptr_t)d & 3))
  1.3311 +	{
  1.3312 +	    *(uint16_t *)d = *(uint16_t *)s;
  1.3313 +	    w -= 2;
  1.3314 +	    s += 2;
  1.3315 +	    d += 2;
  1.3316 +	}
  1.3317 +
  1.3318 +	while (w >= 4 && ((uintptr_t)d & 7))
  1.3319 +	{
  1.3320 +	    *(uint32_t *)d = ldl_u ((uint32_t *)s);
  1.3321 +
  1.3322 +	    w -= 4;
  1.3323 +	    s += 4;
  1.3324 +	    d += 4;
  1.3325 +	}
  1.3326 +
  1.3327 +	while (w >= 64)
  1.3328 +	{
  1.3329 +#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
  1.3330 +	    __asm__ (
  1.3331 +	        "movq	  (%1),	  %%mm0\n"
  1.3332 +	        "movq	 8(%1),	  %%mm1\n"
  1.3333 +	        "movq	16(%1),	  %%mm2\n"
  1.3334 +	        "movq	24(%1),	  %%mm3\n"
  1.3335 +	        "movq	32(%1),	  %%mm4\n"
  1.3336 +	        "movq	40(%1),	  %%mm5\n"
  1.3337 +	        "movq	48(%1),	  %%mm6\n"
  1.3338 +	        "movq	56(%1),	  %%mm7\n"
  1.3339 +
  1.3340 +	        "movq	%%mm0,	  (%0)\n"
  1.3341 +	        "movq	%%mm1,	 8(%0)\n"
  1.3342 +	        "movq	%%mm2,	16(%0)\n"
  1.3343 +	        "movq	%%mm3,	24(%0)\n"
  1.3344 +	        "movq	%%mm4,	32(%0)\n"
  1.3345 +	        "movq	%%mm5,	40(%0)\n"
  1.3346 +	        "movq	%%mm6,	48(%0)\n"
  1.3347 +	        "movq	%%mm7,	56(%0)\n"
  1.3348 +		:
  1.3349 +		: "r" (d), "r" (s)
  1.3350 +		: "memory",
  1.3351 +		  "%mm0", "%mm1", "%mm2", "%mm3",
  1.3352 +		  "%mm4", "%mm5", "%mm6", "%mm7");
  1.3353 +#else
  1.3354 +	    __m64 v0 = ldq_u ((__m64 *)(s + 0));
  1.3355 +	    __m64 v1 = ldq_u ((__m64 *)(s + 8));
  1.3356 +	    __m64 v2 = ldq_u ((__m64 *)(s + 16));
  1.3357 +	    __m64 v3 = ldq_u ((__m64 *)(s + 24));
  1.3358 +	    __m64 v4 = ldq_u ((__m64 *)(s + 32));
  1.3359 +	    __m64 v5 = ldq_u ((__m64 *)(s + 40));
  1.3360 +	    __m64 v6 = ldq_u ((__m64 *)(s + 48));
  1.3361 +	    __m64 v7 = ldq_u ((__m64 *)(s + 56));
  1.3362 +	    *(__m64 *)(d + 0)  = v0;
  1.3363 +	    *(__m64 *)(d + 8)  = v1;
  1.3364 +	    *(__m64 *)(d + 16) = v2;
  1.3365 +	    *(__m64 *)(d + 24) = v3;
  1.3366 +	    *(__m64 *)(d + 32) = v4;
  1.3367 +	    *(__m64 *)(d + 40) = v5;
  1.3368 +	    *(__m64 *)(d + 48) = v6;
  1.3369 +	    *(__m64 *)(d + 56) = v7;
  1.3370 +#endif
  1.3371 +
  1.3372 +	    w -= 64;
  1.3373 +	    s += 64;
  1.3374 +	    d += 64;
  1.3375 +	}
  1.3376 +	while (w >= 4)
  1.3377 +	{
  1.3378 +	    *(uint32_t *)d = ldl_u ((uint32_t *)s);
  1.3379 +
  1.3380 +	    w -= 4;
  1.3381 +	    s += 4;
  1.3382 +	    d += 4;
  1.3383 +	}
  1.3384 +	if (w >= 2)
  1.3385 +	{
  1.3386 +	    *(uint16_t *)d = *(uint16_t *)s;
  1.3387 +	    w -= 2;
  1.3388 +	    s += 2;
  1.3389 +	    d += 2;
  1.3390 +	}
  1.3391 +    }
  1.3392 +
  1.3393 +    _mm_empty ();
  1.3394 +
  1.3395 +    return TRUE;
  1.3396 +}
  1.3397 +
  1.3398 +static void
  1.3399 +mmx_composite_copy_area (pixman_implementation_t *imp,
  1.3400 +                         pixman_composite_info_t *info)
  1.3401 +{
  1.3402 +    PIXMAN_COMPOSITE_ARGS (info);
  1.3403 +
  1.3404 +    mmx_blt (imp, src_image->bits.bits,
  1.3405 +	     dest_image->bits.bits,
  1.3406 +	     src_image->bits.rowstride,
  1.3407 +	     dest_image->bits.rowstride,
  1.3408 +	     PIXMAN_FORMAT_BPP (src_image->bits.format),
  1.3409 +	     PIXMAN_FORMAT_BPP (dest_image->bits.format),
  1.3410 +	     src_x, src_y, dest_x, dest_y, width, height);
  1.3411 +}
  1.3412 +
  1.3413 +static void
  1.3414 +mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
  1.3415 +                                pixman_composite_info_t *info)
  1.3416 +{
  1.3417 +    PIXMAN_COMPOSITE_ARGS (info);
  1.3418 +    uint32_t  *src, *src_line;
  1.3419 +    uint32_t  *dst, *dst_line;
  1.3420 +    uint8_t  *mask, *mask_line;
  1.3421 +    int src_stride, mask_stride, dst_stride;
  1.3422 +    int32_t w;
  1.3423 +
  1.3424 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.3425 +    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
  1.3426 +    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
  1.3427 +
  1.3428 +    while (height--)
  1.3429 +    {
  1.3430 +	src = src_line;
  1.3431 +	src_line += src_stride;
  1.3432 +	dst = dst_line;
  1.3433 +	dst_line += dst_stride;
  1.3434 +	mask = mask_line;
  1.3435 +	mask_line += mask_stride;
  1.3436 +
  1.3437 +	w = width;
  1.3438 +
  1.3439 +	while (w--)
  1.3440 +	{
  1.3441 +	    uint64_t m = *mask;
  1.3442 +
  1.3443 +	    if (m)
  1.3444 +	    {
  1.3445 +		uint32_t ssrc = *src | 0xff000000;
  1.3446 +		__m64 s = load8888 (&ssrc);
  1.3447 +
  1.3448 +		if (m == 0xff)
  1.3449 +		{
  1.3450 +		    store8888 (dst, s);
  1.3451 +		}
  1.3452 +		else
  1.3453 +		{
  1.3454 +		    __m64 sa = expand_alpha (s);
  1.3455 +		    __m64 vm = expand_alpha_rev (to_m64 (m));
  1.3456 +		    __m64 vdest = in_over (s, sa, vm, load8888 (dst));
  1.3457 +
  1.3458 +		    store8888 (dst, vdest);
  1.3459 +		}
  1.3460 +	    }
  1.3461 +
  1.3462 +	    mask++;
  1.3463 +	    dst++;
  1.3464 +	    src++;
  1.3465 +	}
  1.3466 +    }
  1.3467 +
  1.3468 +    _mm_empty ();
  1.3469 +}
  1.3470 +
  1.3471 +static void
  1.3472 +mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
  1.3473 +                                   pixman_composite_info_t *info)
  1.3474 +{
  1.3475 +    PIXMAN_COMPOSITE_ARGS (info);
  1.3476 +    uint32_t src;
  1.3477 +    uint32_t    *dst_line, *dst;
  1.3478 +    int32_t w;
  1.3479 +    int dst_stride;
  1.3480 +    __m64 vsrc;
  1.3481 +
  1.3482 +    CHECKPOINT ();
  1.3483 +
  1.3484 +    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
  1.3485 +
  1.3486 +    if (src == 0)
  1.3487 +	return;
  1.3488 +
  1.3489 +    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
  1.3490 +
  1.3491 +    vsrc = load8888 (&src);
  1.3492 +
  1.3493 +    while (height--)
  1.3494 +    {
  1.3495 +	dst = dst_line;
  1.3496 +	dst_line += dst_stride;
  1.3497 +	w = width;
  1.3498 +
  1.3499 +	CHECKPOINT ();
  1.3500 +
  1.3501 +	while (w && (uintptr_t)dst & 7)
  1.3502 +	{
  1.3503 +	    __m64 vdest = load8888 (dst);
  1.3504 +
  1.3505 +	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
  1.3506 +
  1.3507 +	    w--;
  1.3508 +	    dst++;
  1.3509 +	}
  1.3510 +
  1.3511 +	while (w >= 2)
  1.3512 +	{
  1.3513 +	    __m64 vdest = *(__m64 *)dst;
  1.3514 +	    __m64 dest0 = expand8888 (vdest, 0);
  1.3515 +	    __m64 dest1 = expand8888 (vdest, 1);
   1.3516 +
  1.3518 +	    dest0 = over (dest0, expand_alpha (dest0), vsrc);
  1.3519 +	    dest1 = over (dest1, expand_alpha (dest1), vsrc);
  1.3520 +
  1.3521 +	    *(__m64 *)dst = pack8888 (dest0, dest1);
  1.3522 +
  1.3523 +	    dst += 2;
  1.3524 +	    w -= 2;
  1.3525 +	}
  1.3526 +
  1.3527 +	CHECKPOINT ();
  1.3528 +
  1.3529 +	if (w)
  1.3530 +	{
  1.3531 +	    __m64 vdest = load8888 (dst);
  1.3532 +
  1.3533 +	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
  1.3534 +	}
  1.3535 +    }
  1.3536 +
  1.3537 +    _mm_empty ();
  1.3538 +}
  1.3539 +
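          +/* Bilinear scaling: each output pixel is the weighted average of
          + * a 2x2 source block.  The vertical blend computes
          + * t * wt + b * wb, the horizontal blend then mixes the two
          + * neighbors with weights derived from the fractional part of vx,
          + * all in BILINEAR_INTERPOLATION_BITS of fixed point
          + * (wt + wb == BSHIFT).  The final shift by
          + * 2 * BILINEAR_INTERPOLATION_BITS removes both weight factors
          + * before packing back to 8 bits per channel.
          + */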
  1.3540 +#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
  1.3541 +#define BMSK (BSHIFT - 1)
  1.3542 +
  1.3543 +#define BILINEAR_DECLARE_VARIABLES						\
  1.3544 +    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);				\
  1.3545 +    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);				\
  1.3546 +    const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT);	\
  1.3547 +    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);				\
  1.3548 +    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);			\
  1.3549 +    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);		\
  1.3550 +    const __m64 mm_zero = _mm_setzero_si64 ();					\
  1.3551 +    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)
  1.3552 +
  1.3553 +#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
  1.3554 +do {										\
  1.3555 +    /* fetch 2x2 pixel block into 2 mmx registers */				\
  1.3556 +    __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);		\
  1.3557 +    __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);		\
  1.3558 +    /* vertical interpolation */						\
  1.3559 +    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);		\
  1.3560 +    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);		\
  1.3561 +    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);		\
  1.3562 +    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);		\
  1.3563 +    __m64 hi = _mm_add_pi16 (t_hi, b_hi);					\
  1.3564 +    __m64 lo = _mm_add_pi16 (t_lo, b_lo);					\
  1.3565 +    vx += unit_x;								\
  1.3566 +    if (BILINEAR_INTERPOLATION_BITS < 8)					\
  1.3567 +    {										\
  1.3568 +	/* calculate horizontal weights */					\
  1.3569 +	__m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,		\
  1.3570 +			  _mm_srli_pi16 (mm_x,					\
  1.3571 +					 16 - BILINEAR_INTERPOLATION_BITS)));	\
  1.3572 +	/* horizontal interpolation */						\
  1.3573 +	__m64 p = _mm_unpacklo_pi16 (lo, hi);					\
  1.3574 +	__m64 q = _mm_unpackhi_pi16 (lo, hi);					\
  1.3575 +	lo = _mm_madd_pi16 (p, mm_wh);						\
  1.3576 +	hi = _mm_madd_pi16 (q, mm_wh);						\
  1.3577 +    }										\
  1.3578 +    else									\
  1.3579 +    {										\
  1.3580 +	/* calculate horizontal weights */					\
  1.3581 +	__m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x,		\
  1.3582 +					16 - BILINEAR_INTERPOLATION_BITS));	\
  1.3583 +	__m64 mm_wh_hi = _mm_srli_pi16 (mm_x,					\
  1.3584 +					16 - BILINEAR_INTERPOLATION_BITS);	\
  1.3585 +	/* horizontal interpolation */						\
  1.3586 +	__m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo);				\
  1.3587 +	__m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi);				\
  1.3588 +	__m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo);				\
  1.3589 +	__m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi);				\
  1.3590 +	lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo),		\
  1.3591 +			   _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi));		\
  1.3592 +	hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo),		\
  1.3593 +			   _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi));		\
  1.3594 +    }										\
  1.3595 +    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
  1.3596 +    /* shift and pack the result */						\
  1.3597 +    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);			\
  1.3598 +    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);			\
  1.3599 +    lo = _mm_packs_pi32 (lo, hi);						\
  1.3600 +    lo = _mm_packs_pu16 (lo, lo);						\
  1.3601 +    pix = lo;									\
  1.3602 +} while (0)
  1.3603 +
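/*
 * Per 8-bit channel, BILINEAR_INTERPOLATE_ONE_PIXEL above amounts to the
 * following scalar computation (illustration only, not compiled).  wt/wb
 * are the vertical weights, the horizontal weights come from the
 * fractional bits of vx, and each pair of weights sums to BSHIFT.
 */
#if 0
static uint8_t
bilinear_channel (uint8_t tl, uint8_t tr, uint8_t bl, uint8_t br,
		  int wt, int wb, pixman_fixed_t vx)
{
    int wr = (vx >> (16 - BILINEAR_INTERPOLATION_BITS)) & BMSK;
    int wl = BSHIFT - wr;

    /* vertical pass over both columns, then the horizontal pass */
    int left  = tl * wt + bl * wb;
    int right = tr * wt + br * wb;

    return (left * wl + right * wr) >> (2 * BILINEAR_INTERPOLATION_BITS);
}
#endif
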
  1.3604 +#define BILINEAR_SKIP_ONE_PIXEL()						\
  1.3605 +do {										\
  1.3606 +    vx += unit_x;								\
  1.3607 +    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
   1.3608 +} while (0)
  1.3609 +
  1.3610 +static force_inline void
  1.3611 +scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
  1.3612 +					    const uint32_t * mask,
  1.3613 +					    const uint32_t * src_top,
  1.3614 +					    const uint32_t * src_bottom,
  1.3615 +					    int32_t          w,
  1.3616 +					    int              wt,
  1.3617 +					    int              wb,
  1.3618 +					    pixman_fixed_t   vx,
  1.3619 +					    pixman_fixed_t   unit_x,
  1.3620 +					    pixman_fixed_t   max_vx,
  1.3621 +					    pixman_bool_t    zero_src)
  1.3622 +{
  1.3623 +    BILINEAR_DECLARE_VARIABLES;
  1.3624 +    __m64 pix;
  1.3625 +
  1.3626 +    while (w--)
  1.3627 +    {
  1.3628 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
  1.3629 +	store (dst, pix);
  1.3630 +	dst++;
  1.3631 +    }
  1.3632 +
  1.3633 +    _mm_empty ();
  1.3634 +}
  1.3635 +
  1.3636 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
  1.3637 +			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
  1.3638 +			       uint32_t, uint32_t, uint32_t,
  1.3639 +			       COVER, FLAG_NONE)
  1.3640 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
  1.3641 +			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
  1.3642 +			       uint32_t, uint32_t, uint32_t,
  1.3643 +			       PAD, FLAG_NONE)
  1.3644 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
  1.3645 +			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
  1.3646 +			       uint32_t, uint32_t, uint32_t,
  1.3647 +			       NONE, FLAG_NONE)
  1.3648 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
  1.3649 +			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
  1.3650 +			       uint32_t, uint32_t, uint32_t,
  1.3651 +			       NORMAL, FLAG_NONE)
  1.3652 +
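/*
 * The four instantiations above differ only in how samples falling outside
 * the source image are handled (COVER: every sample is inside the source;
 * PAD, NONE and NORMAL: pad-, zero- and tile-extended sampling).
 * FAST_BILINEAR_MAINLOOP_COMMON wraps the scanline routine in a full
 * composite function for each mode.
 */
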
  1.3653 +static force_inline void
  1.3654 +scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
  1.3655 +					     const uint32_t * mask,
  1.3656 +					     const uint32_t * src_top,
  1.3657 +					     const uint32_t * src_bottom,
  1.3658 +					     int32_t          w,
  1.3659 +					     int              wt,
  1.3660 +					     int              wb,
  1.3661 +					     pixman_fixed_t   vx,
  1.3662 +					     pixman_fixed_t   unit_x,
  1.3663 +					     pixman_fixed_t   max_vx,
  1.3664 +					     pixman_bool_t    zero_src)
  1.3665 +{
  1.3666 +    BILINEAR_DECLARE_VARIABLES;
  1.3667 +    __m64 pix1, pix2;
  1.3668 +
  1.3669 +    while (w)
  1.3670 +    {
  1.3671 +	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  1.3672 +
   1.3673 +	if (!is_zero (pix1))	/* skip fully transparent source pixels */
  1.3674 +	{
  1.3675 +	    pix2 = load (dst);
  1.3676 +	    store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
  1.3677 +	}
  1.3678 +
  1.3679 +	w--;
  1.3680 +	dst++;
  1.3681 +    }
  1.3682 +
  1.3683 +    _mm_empty ();
  1.3684 +}
  1.3685 +
  1.3686 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
  1.3687 +			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
  1.3688 +			       uint32_t, uint32_t, uint32_t,
  1.3689 +			       COVER, FLAG_NONE)
  1.3690 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
  1.3691 +			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
  1.3692 +			       uint32_t, uint32_t, uint32_t,
  1.3693 +			       PAD, FLAG_NONE)
  1.3694 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
  1.3695 +			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
  1.3696 +			       uint32_t, uint32_t, uint32_t,
  1.3697 +			       NONE, FLAG_NONE)
  1.3698 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
  1.3699 +			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
  1.3700 +			       uint32_t, uint32_t, uint32_t,
  1.3701 +			       NORMAL, FLAG_NONE)
  1.3702 +
  1.3703 +static force_inline void
  1.3704 +scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
  1.3705 +					       const uint8_t  * mask,
  1.3706 +					       const uint32_t * src_top,
  1.3707 +					       const uint32_t * src_bottom,
  1.3708 +					       int32_t          w,
  1.3709 +					       int              wt,
  1.3710 +					       int              wb,
  1.3711 +					       pixman_fixed_t   vx,
  1.3712 +					       pixman_fixed_t   unit_x,
  1.3713 +					       pixman_fixed_t   max_vx,
  1.3714 +					       pixman_bool_t    zero_src)
  1.3715 +{
  1.3716 +    BILINEAR_DECLARE_VARIABLES;
  1.3717 +    __m64 pix1, pix2;
  1.3718 +    uint32_t m;
  1.3719 +
  1.3720 +    while (w)
  1.3721 +    {
  1.3722 +	m = (uint32_t) *mask++;
  1.3723 +
  1.3724 +	if (m)
  1.3725 +	{
  1.3726 +	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
  1.3727 +
   1.3728 +	    if (m == 0xff && is_opaque (pix1))	/* opaque pixel under a full mask: plain copy */
  1.3729 +	    {
  1.3730 +		store (dst, pix1);
  1.3731 +	    }
  1.3732 +	    else
  1.3733 +	    {
  1.3734 +		__m64 ms, md, ma, msa;
  1.3735 +
  1.3736 +		pix2 = load (dst);
  1.3737 +		ma = expand_alpha_rev (to_m64 (m));
  1.3738 +		ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
  1.3739 +		md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());
  1.3740 +
  1.3741 +		msa = expand_alpha (ms);
  1.3742 +
  1.3743 +		store8888 (dst, (in_over (ms, msa, ma, md)));
  1.3744 +	    }
  1.3745 +	}
  1.3746 +	else
  1.3747 +	{
  1.3748 +	    BILINEAR_SKIP_ONE_PIXEL ();
  1.3749 +	}
  1.3750 +
  1.3751 +	w--;
  1.3752 +	dst++;
  1.3753 +    }
  1.3754 +
  1.3755 +    _mm_empty ();
  1.3756 +}
  1.3757 +
  1.3758 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
  1.3759 +			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
  1.3760 +			       uint32_t, uint8_t, uint32_t,
  1.3761 +			       COVER, FLAG_HAVE_NON_SOLID_MASK)
  1.3762 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
  1.3763 +			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
  1.3764 +			       uint32_t, uint8_t, uint32_t,
  1.3765 +			       PAD, FLAG_HAVE_NON_SOLID_MASK)
  1.3766 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
  1.3767 +			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
  1.3768 +			       uint32_t, uint8_t, uint32_t,
  1.3769 +			       NONE, FLAG_HAVE_NON_SOLID_MASK)
  1.3770 +FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
  1.3771 +			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
  1.3772 +			       uint32_t, uint8_t, uint32_t,
  1.3773 +			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
  1.3774 +
  1.3775 +static uint32_t *
  1.3776 +mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
  1.3777 +{
  1.3778 +    int w = iter->width;
  1.3779 +    uint32_t *dst = iter->buffer;
  1.3780 +    uint32_t *src = (uint32_t *)iter->bits;
  1.3781 +
  1.3782 +    iter->bits += iter->stride;
  1.3783 +
  1.3784 +    while (w && ((uintptr_t)dst) & 7)
  1.3785 +    {
  1.3786 +	*dst++ = (*src++) | 0xff000000;
  1.3787 +	w--;
  1.3788 +    }
  1.3789 +
  1.3790 +    while (w >= 8)
  1.3791 +    {
  1.3792 +	__m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
  1.3793 +	__m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
  1.3794 +	__m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
  1.3795 +	__m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
  1.3796 +
  1.3797 +	*(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
  1.3798 +	*(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
  1.3799 +	*(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
  1.3800 +	*(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
  1.3801 +
  1.3802 +	dst += 8;
  1.3803 +	src += 8;
  1.3804 +	w -= 8;
  1.3805 +    }
  1.3806 +
  1.3807 +    while (w)
  1.3808 +    {
  1.3809 +	*dst++ = (*src++) | 0xff000000;
  1.3810 +	w--;
  1.3811 +    }
  1.3812 +
  1.3813 +    _mm_empty ();
  1.3814 +    return iter->buffer;
  1.3815 +}
  1.3816 +
  1.3817 +static uint32_t *
  1.3818 +mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
  1.3819 +{
  1.3820 +    int w = iter->width;
  1.3821 +    uint32_t *dst = iter->buffer;
  1.3822 +    uint16_t *src = (uint16_t *)iter->bits;
  1.3823 +
  1.3824 +    iter->bits += iter->stride;
  1.3825 +
  1.3826 +    while (w && ((uintptr_t)dst) & 0x0f)
  1.3827 +    {
  1.3828 +	uint16_t s = *src++;
  1.3829 +
  1.3830 +	*dst++ = convert_0565_to_8888 (s);
  1.3831 +	w--;
  1.3832 +    }
  1.3833 +
  1.3834 +    while (w >= 4)
  1.3835 +    {
  1.3836 +	__m64 vsrc = ldq_u ((__m64 *)src);
  1.3837 +	__m64 mm0, mm1;
  1.3838 +
  1.3839 +	expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
  1.3840 +
  1.3841 +	*(__m64 *)(dst + 0) = mm0;
  1.3842 +	*(__m64 *)(dst + 2) = mm1;
  1.3843 +
  1.3844 +	dst += 4;
  1.3845 +	src += 4;
  1.3846 +	w -= 4;
  1.3847 +    }
  1.3848 +
  1.3849 +    while (w)
  1.3850 +    {
  1.3851 +	uint16_t s = *src++;
  1.3852 +
  1.3853 +	*dst++ = convert_0565_to_8888 (s);
  1.3854 +	w--;
  1.3855 +    }
  1.3856 +
  1.3857 +    _mm_empty ();
  1.3858 +    return iter->buffer;
  1.3859 +}
  1.3860 +
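/*
 * convert_0565_to_8888 () expands r5g6b5 by replicating the top bits of
 * each field into the low bits and forcing alpha to 0xff.  A scalar sketch
 * of the equivalent computation (illustration only, not compiled):
 */
#if 0
static uint32_t
expand_0565_scalar (uint16_t s)
{
    uint32_t r = (s >> 11) & 0x1f;
    uint32_t g = (s >> 5) & 0x3f;
    uint32_t b = s & 0x1f;

    r = (r << 3) | (r >> 2);	/* 5 -> 8 bits */
    g = (g << 2) | (g >> 4);	/* 6 -> 8 bits */
    b = (b << 3) | (b >> 2);	/* 5 -> 8 bits */

    return 0xff000000 | (r << 16) | (g << 8) | b;
}
#endif
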
  1.3861 +static uint32_t *
  1.3862 +mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
  1.3863 +{
  1.3864 +    int w = iter->width;
  1.3865 +    uint32_t *dst = iter->buffer;
  1.3866 +    uint8_t *src = iter->bits;
  1.3867 +
  1.3868 +    iter->bits += iter->stride;
  1.3869 +
  1.3870 +    while (w && (((uintptr_t)dst) & 15))
  1.3871 +    {
   1.3872 +	*dst++ = *(src++) << 24;
   1.3873 +	w--;
  1.3874 +    }
  1.3875 +
  1.3876 +    while (w >= 8)
  1.3877 +    {
  1.3878 +	__m64 mm0 = ldq_u ((__m64 *)src);
  1.3879 +
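	/* Unpacking against zero from the low side moves each byte up by
	 * 8 bits per level, so the two levels below leave every a8 value
	 * in the top (alpha) byte of an otherwise zero 32-bit pixel. */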
   1.3880 +	__m64 mm1 = _mm_unpacklo_pi8  (_mm_setzero_si64 (), mm0);
   1.3881 +	__m64 mm2 = _mm_unpackhi_pi8  (_mm_setzero_si64 (), mm0);
   1.3882 +	__m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64 (), mm1);
   1.3883 +	__m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64 (), mm1);
   1.3884 +	__m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64 (), mm2);
   1.3885 +	__m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64 (), mm2);
  1.3886 +
  1.3887 +	*(__m64 *)(dst + 0) = mm3;
  1.3888 +	*(__m64 *)(dst + 2) = mm4;
  1.3889 +	*(__m64 *)(dst + 4) = mm5;
  1.3890 +	*(__m64 *)(dst + 6) = mm6;
  1.3891 +
  1.3892 +	dst += 8;
  1.3893 +	src += 8;
  1.3894 +	w -= 8;
  1.3895 +    }
  1.3896 +
  1.3897 +    while (w)
  1.3898 +    {
  1.3899 +	*dst++ = *(src++) << 24;
  1.3900 +	w--;
  1.3901 +    }
  1.3902 +
  1.3903 +    _mm_empty ();
  1.3904 +    return iter->buffer;
  1.3905 +}
  1.3906 +
  1.3907 +typedef struct
  1.3908 +{
  1.3909 +    pixman_format_code_t	format;
  1.3910 +    pixman_iter_get_scanline_t	get_scanline;
  1.3911 +} fetcher_info_t;
  1.3912 +
  1.3913 +static const fetcher_info_t fetchers[] =
  1.3914 +{
  1.3915 +    { PIXMAN_x8r8g8b8,		mmx_fetch_x8r8g8b8 },
  1.3916 +    { PIXMAN_r5g6b5,		mmx_fetch_r5g6b5 },
  1.3917 +    { PIXMAN_a8,		mmx_fetch_a8 },
  1.3918 +    { PIXMAN_null }
  1.3919 +};
  1.3920 +
  1.3921 +static pixman_bool_t
  1.3922 +mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
  1.3923 +{
  1.3924 +    pixman_image_t *image = iter->image;
  1.3925 +
  1.3926 +#define FLAGS								\
  1.3927 +    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
  1.3928 +     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
  1.3929 +
  1.3930 +    if ((iter->iter_flags & ITER_NARROW)			&&
  1.3931 +	(iter->image_flags & FLAGS) == FLAGS)
  1.3932 +    {
  1.3933 +	const fetcher_info_t *f;
  1.3934 +
  1.3935 +	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
  1.3936 +	{
  1.3937 +	    if (image->common.extended_format_code == f->format)
  1.3938 +	    {
  1.3939 +		uint8_t *b = (uint8_t *)image->bits.bits;
   1.3940 +		int s = image->bits.rowstride * 4;	/* rowstride is in 32-bit units; convert to bytes */
  1.3941 +
  1.3942 +		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
  1.3943 +		iter->stride = s;
  1.3944 +
  1.3945 +		iter->get_scanline = f->get_scanline;
  1.3946 +		return TRUE;
  1.3947 +	    }
  1.3948 +	}
  1.3949 +    }
  1.3950 +
  1.3951 +    return FALSE;
  1.3952 +}
  1.3953 +
  1.3954 +static const pixman_fast_path_t mmx_fast_paths[] =
  1.3955 +{
  1.3956 +    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
  1.3957 +    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
  1.3958 +    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
  1.3959 +    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
  1.3960 +    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
  1.3961 +    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
  1.3962 +    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
  1.3963 +    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
  1.3964 +    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
  1.3965 +    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
  1.3966 +    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
  1.3967 +    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
  1.3968 +    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
  1.3969 +    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
  1.3970 +    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
  1.3971 +    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
  1.3972 +    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
  1.3973 +    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
  1.3974 +    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
  1.3975 +    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
  1.3976 +    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
  1.3977 +    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
  1.3978 +    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
  1.3979 +    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
  1.3980 +    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
  1.3981 +    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
  1.3982 +    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
  1.3983 +    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
  1.3984 +    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
  1.3985 +    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
  1.3986 +    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
  1.3987 +    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
  1.3988 +    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
  1.3989 +    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
  1.3990 +    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
  1.3991 +    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
  1.3992 +
  1.3993 +    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
  1.3994 +    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
  1.3995 +    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
  1.3996 +    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
  1.3997 +    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
  1.3998 +    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),
  1.3999 +
  1.4000 +    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
  1.4001 +    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),
  1.4002 +
  1.4003 +    PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
  1.4004 +    PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
  1.4005 +    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
  1.4006 +    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
  1.4007 +    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8		   ),
  1.4008 +    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),
  1.4009 +
  1.4010 +    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
  1.4011 +    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
  1.4012 +    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
  1.4013 +    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
  1.4014 +    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
  1.4015 +    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
  1.4016 +    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
  1.4017 +    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
  1.4018 +    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
  1.4019 +    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
  1.4020 +    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
  1.4021 +    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
  1.4022 +    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
  1.4023 +    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
  1.4024 +    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
  1.4025 +    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),
  1.4026 +
  1.4027 +    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
  1.4028 +    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),
  1.4029 +
  1.4030 +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          a8r8g8b8, mmx_8888_8888                     ),
  1.4031 +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
  1.4032 +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
  1.4033 +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          a8b8g8r8, mmx_8888_8888                     ),
  1.4034 +    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
  1.4035 +    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
  1.4036 +
  1.4037 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         x8r8g8b8, mmx_8888_8888                     ),
  1.4038 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         x8b8g8r8, mmx_8888_8888                     ),
  1.4039 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         a8r8g8b8, mmx_8888_8888                     ),
  1.4040 +    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         a8b8g8r8, mmx_8888_8888                     ),
  1.4041 +
  1.4042 +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888                   ),
  1.4043 +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888                   ),
  1.4044 +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888                   ),
  1.4045 +    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888                   ),
  1.4046 +
  1.4047 +    { PIXMAN_OP_NONE },
  1.4048 +};
  1.4049 +
  1.4050 +pixman_implementation_t *
  1.4051 +_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
  1.4052 +{
  1.4053 +    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);
  1.4054 +
  1.4055 +    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
  1.4056 +    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
  1.4057 +    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
  1.4058 +    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
  1.4059 +    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
  1.4060 +    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
  1.4061 +    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
  1.4062 +    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
  1.4063 +    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
  1.4064 +    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
  1.4065 +    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
  1.4066 +
  1.4067 +    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
  1.4068 +    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
  1.4069 +    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
  1.4070 +    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
  1.4071 +    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
  1.4072 +    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
  1.4073 +    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
  1.4074 +    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
  1.4075 +    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
  1.4076 +    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
  1.4077 +    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
  1.4078 +
  1.4079 +    imp->blt = mmx_blt;
  1.4080 +    imp->fill = mmx_fill;
  1.4081 +
  1.4082 +    imp->src_iter_init = mmx_src_iter_init;
  1.4083 +
  1.4084 +    return imp;
  1.4085 +}
  1.4086 +
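/*
 * For context: a sketch of how this constructor is chained into pixman's
 * CPU dispatch.  Names are approximate -- the real detection code lives in
 * pixman-x86.c (and the ARM/MIPS equivalents), not in this file.
 */
#if 0
pixman_implementation_t *imp = _pixman_implementation_create_general ();
imp = _pixman_implementation_create_fast_path (imp);
if (have_feature (X86_MMX))
    imp = _pixman_implementation_create_mmx (imp);	/* this file */
#endif
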
  1.4087 +#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */