gfx/cairo/libpixman/src/pixman-mmx.c

author      Michael Schloh von Bennewitz <michael@schloh.com>
date        Wed, 31 Dec 2014 06:09:35 +0100
changeset   0 6474c204b198
permissions -rw-r--r--

Cloned from upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

/*
 * Copyright © 2004, 2005 Red Hat, Inc.
 * Copyright © 2004 Nicholas Miell
 * Copyright © 2005 Trolltech AS
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Søren Sandmann (sandmann@redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
 *
 * Based on work by Owen Taylor
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI

#ifdef USE_LOONGSON_MMI
#include <loongson-mmintrin.h>
#else
#include <mmintrin.h>
#endif
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"

#define no_vERBOSE

#ifdef VERBOSE
#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
#else
#define CHECKPOINT()
#endif

#if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{

}
#endif

#ifdef USE_X86_MMX
# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
#  include <xmmintrin.h>
# else
/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
 * instructions to be generated that we don't want. Just duplicate the
 * functions we want to use. */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
    int ret;

    asm ("pmovmskb %1, %0\n\t"
        : "=r" (ret)
        : "y" (__A)
    );

    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
    asm ("pmulhuw %1, %0\n\t"
        : "+y" (__A)
        : "y" (__B)
    );
    return __A;
}

# ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int8_t const __N)
{
    __m64 ret;

    asm ("pshufw %2, %1, %0\n\t"
        : "=y" (ret)
        : "y" (__A), "K" (__N)
    );

    return ret;
}
# else
# define _mm_shuffle_pi16(A, N) \
    ({ \
        __m64 ret; \
        \
        asm ("pshufw %2, %1, %0\n\t" \
            : "=y" (ret) \
            : "y" (A), "K" ((const int8_t)N) \
        ); \
        \
        ret; \
    })
# endif
# endif
#endif

#ifndef _MSC_VER
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
    (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
#endif

/* Notes about writing mmx code
 *
 * give memory operands as the second operand. If you give it as the
 * first, gcc will first load it into a register, then use that
 * register
 *
 * ie. use
 *
 *     _mm_mullo_pi16 (x, mmx_constant);
 *
 * not
 *
 *     _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies. i.e. when you need a value, try
 * to calculate it from a value that was calculated as early as
 * possible.
 */
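
/* Added sketch (not part of upstream pixman), compiled out: a hypothetical
 * helper illustrating the operand-order note above.  Both calls compute
 * the same product, but only the first form lets gcc keep the constant as
 * the memory operand of pmullw; the second forces an extra movq to load
 * the constant into a register first. */
#if 0
static __m64
operand_order_example (__m64 x, const __m64 *mmx_constant)
{
    __m64 good = _mm_mullo_pi16 (x, *mmx_constant); /* pmullw (mem), %mm  */
    __m64 bad  = _mm_mullo_pi16 (*mmx_constant, x); /* movq, then pmullw  */
    return _mm_adds_pu16 (good, bad);
}
#endif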

/* --------------- MMX primitives ------------------------------------- */

/* If __m64 is defined as a struct or union, then define M64_MEMBER to be
 * the name of the member used to access the data.
 * If __m64 requires using mm_cvt* intrinsics functions to convert between
 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
 * If __m64 and uint64_t values can just be cast to each other directly,
 * then define USE_M64_CASTS.
 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
 */
#ifdef _MSC_VER
# define M64_MEMBER m64_u64
#elif defined(__ICC)
# define USE_CVT_INTRINSICS
#elif defined(USE_LOONGSON_MMI)
# define USE_M64_DOUBLE
#elif defined(__GNUC__)
# define USE_M64_CASTS
#elif defined(__SUNPRO_C)
# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
 * is defined.  If it is used, then the mm_cvt* intrinsics must be used.
 */
#  define USE_CVT_INTRINSICS
# else
/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
 */
#  define M64_MEMBER l_
# endif
#endif

#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
typedef uint64_t mmxdatafield;
#else
typedef __m64 mmxdatafield;
#endif

typedef struct
{
    mmxdatafield mmx_4x00ff;
    mmxdatafield mmx_4x0080;
    mmxdatafield mmx_565_rgb;
    mmxdatafield mmx_565_unpack_multiplier;
    mmxdatafield mmx_565_pack_multiplier;
    mmxdatafield mmx_565_r;
    mmxdatafield mmx_565_g;
    mmxdatafield mmx_565_b;
    mmxdatafield mmx_packed_565_rb;
    mmxdatafield mmx_packed_565_g;
    mmxdatafield mmx_expand_565_g;
    mmxdatafield mmx_expand_565_b;
    mmxdatafield mmx_expand_565_r;
#ifndef USE_LOONGSON_MMI
    mmxdatafield mmx_mask_0;
    mmxdatafield mmx_mask_1;
    mmxdatafield mmx_mask_2;
    mmxdatafield mmx_mask_3;
#endif
    mmxdatafield mmx_full_alpha;
    mmxdatafield mmx_4x0101;
    mmxdatafield mmx_ff000000;
} mmx_data_t;

#if defined(_MSC_VER)
# define MMXDATA_INIT(field, val) { val ## UI64 }
#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
# define MMXDATA_INIT(field, val) field = { val ## ULL }
#else                           /* mmxdatafield is an integral type */
# define MMXDATA_INIT(field, val) field = val ## ULL
#endif

static const mmx_data_t c =
{
    MMXDATA_INIT (.mmx_4x00ff,                  0x00ff00ff00ff00ff),
    MMXDATA_INIT (.mmx_4x0080,                  0x0080008000800080),
    MMXDATA_INIT (.mmx_565_rgb,                 0x000001f0003f001f),
    MMXDATA_INIT (.mmx_565_unpack_multiplier,   0x0000008404100840),
    MMXDATA_INIT (.mmx_565_pack_multiplier,     0x2000000420000004),
    MMXDATA_INIT (.mmx_565_r,                   0x000000f800000000),
    MMXDATA_INIT (.mmx_565_g,                   0x0000000000fc0000),
    MMXDATA_INIT (.mmx_565_b,                   0x00000000000000f8),
    MMXDATA_INIT (.mmx_packed_565_rb,           0x00f800f800f800f8),
    MMXDATA_INIT (.mmx_packed_565_g,            0x0000fc000000fc00),
    MMXDATA_INIT (.mmx_expand_565_g,            0x07e007e007e007e0),
    MMXDATA_INIT (.mmx_expand_565_b,            0x001f001f001f001f),
    MMXDATA_INIT (.mmx_expand_565_r,            0xf800f800f800f800),
#ifndef USE_LOONGSON_MMI
    MMXDATA_INIT (.mmx_mask_0,                  0xffffffffffff0000),
    MMXDATA_INIT (.mmx_mask_1,                  0xffffffff0000ffff),
    MMXDATA_INIT (.mmx_mask_2,                  0xffff0000ffffffff),
    MMXDATA_INIT (.mmx_mask_3,                  0x0000ffffffffffff),
#endif
    MMXDATA_INIT (.mmx_full_alpha,              0x00ff000000000000),
    MMXDATA_INIT (.mmx_4x0101,                  0x0101010101010101),
    MMXDATA_INIT (.mmx_ff000000,                0xff000000ff000000),
};

#ifdef USE_CVT_INTRINSICS
# define MC(x) to_m64 (c.mmx_ ## x)
#elif defined(USE_M64_CASTS)
# define MC(x) ((__m64)c.mmx_ ## x)
#elif defined(USE_M64_DOUBLE)
# define MC(x) (*(__m64 *)&c.mmx_ ## x)
#else
# define MC(x) c.mmx_ ## x
#endif

static force_inline __m64
to_m64 (uint64_t x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtsi64_m64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    __m64 res;

    res.M64_MEMBER = x;
    return res;
#elif defined USE_M64_DOUBLE
    return *(__m64 *)&x;
#else /* USE_M64_CASTS */
    return (__m64)x;
#endif
}

static force_inline uint64_t
to_uint64 (__m64 x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtm64_si64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    uint64_t res = x.M64_MEMBER;
    return res;
#elif defined USE_M64_DOUBLE
    return *(uint64_t *)&x;
#else /* USE_M64_CASTS */
    return (uint64_t)x;
#endif
}

static force_inline __m64
shift (__m64 v,
       int   s)
{
    if (s > 0)
        return _mm_slli_si64 (v, s);
    else if (s < 0)
        return _mm_srli_si64 (v, -s);
    else
        return v;
}

static force_inline __m64
negate (__m64 mask)
{
    return _mm_xor_si64 (mask, MC (4x00ff));
}

/* Per-channel multiply of two unpacked pixels: computes
 * (a * b + 0x80) * 0x101 >> 16 in each 16-bit lane, a rounded
 * approximation of a * b / 255 (the same value the scalar MUL_UN8
 * macro in pixman-combine32.h produces). */
static force_inline __m64
pix_multiply (__m64 a, __m64 b)
{
    __m64 res;

    res = _mm_mullo_pi16 (a, b);
    res = _mm_adds_pu16 (res, MC (4x0080));
    res = _mm_mulhi_pu16 (res, MC (4x0101));

    return res;
}
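
/* Added reference sketch (not part of upstream pixman), compiled out: the
 * scalar equivalent of one lane of pix_multiply above.  For t in the
 * reachable range, (t + (t >> 8)) >> 8 equals (t * 0x101) >> 16, which is
 * what pmullw/paddusw/pmulhuw compute per channel. */
#if 0
static uint8_t
mul_un8_scalar (uint8_t a, uint8_t b)
{
    uint16_t t = (uint16_t) (a * b + 0x80);
    return (uint8_t) ((t + (t >> 8)) >> 8);
}
#endif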

static force_inline __m64
pix_add (__m64 a, __m64 b)
{
    return _mm_adds_pu8 (a, b);
}

static force_inline __m64
expand_alpha (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline __m64
expand_alpha_rev (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
invert_colors (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m64
over (__m64 src,
      __m64 srca,
      __m64 dest)
{
    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
}

static force_inline __m64
over_rev_non_pre (__m64 src, __m64 dest)
{
    __m64 srca = expand_alpha (src);
    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));

    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
}

static force_inline __m64
in (__m64 src, __m64 mask)
{
    return pix_multiply (src, mask);
}

#ifndef _MSC_VER
static force_inline __m64
in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
{
    return over (in (src, mask), pix_multiply (srca, mask), dest);
}

#else

#define in_over(src, srca, mask, dest) \
    over (in (src, mask), pix_multiply (srca, mask), dest)

#endif
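
/* Added note (not part of upstream pixman): per channel, with all values
 * in [0, 255] and premultiplied alpha, the helpers above compute
 *
 *   over:    dest' = src + dest * (255 - srca) / 255
 *   in:      src'  = src * mask / 255
 *   in_over: dest' = src * mask / 255
 *                  + dest * (255 - srca * mask / 255) / 255
 *
 * i.e. the Porter-Duff OVER operator, optionally restricted by a mask. */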

/* Elemental unaligned loads */

static force_inline __m64 ldq_u(__m64 *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed. */
    return *(__m64 *)p;
#elif defined USE_ARM_IWMMXT
    int align = (uintptr_t)p & 7;
    __m64 *aligned_p;
    if (align == 0)
        return *p;
    aligned_p = (__m64 *)((uintptr_t)p & ~7);
    return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
#else
    /* The packed attribute tells the compiler the member may be
     * unaligned, so it emits whatever unaligned-load sequence the
     * target requires. */
    struct __una_u64 { __m64 x __attribute__((packed)); };
    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
    return (__m64) ptr->x;
#endif
}

static force_inline uint32_t ldl_u(const uint32_t *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed. */
    return *p;
#else
    struct __una_u32 { uint32_t x __attribute__((packed)); };
    const struct __una_u32 *ptr = (const struct __una_u32 *) p;
    return ptr->x;
#endif
}

static force_inline __m64
load (const uint32_t *v)
{
#ifdef USE_LOONGSON_MMI
    __m64 ret;
    asm ("lwc1 %0, %1\n\t"
        : "=f" (ret)
        : "m" (*v)
    );
    return ret;
#else
    return _mm_cvtsi32_si64 (*v);
#endif
}

static force_inline __m64
load8888 (const uint32_t *v)
{
#ifdef USE_LOONGSON_MMI
    return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
#else
    return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
#endif
}

static force_inline __m64
load8888u (const uint32_t *v)
{
    uint32_t l = ldl_u (v);
    return load8888 (&l);
}

static force_inline __m64
pack8888 (__m64 lo, __m64 hi)
{
    return _mm_packs_pu16 (lo, hi);
}

static force_inline void
store (uint32_t *dest, __m64 v)
{
#ifdef USE_LOONGSON_MMI
    asm ("swc1 %1, %0\n\t"
        : "=m" (*dest)
        : "f" (v)
        : "memory"
    );
#else
    *dest = _mm_cvtsi64_si32 (v);
#endif
}

static force_inline void
store8888 (uint32_t *dest, __m64 v)
{
    v = pack8888 (v, _mm_setzero_si64 ());
    store (dest, v);
}

static force_inline pixman_bool_t
is_equal (__m64 a, __m64 b)
{
#ifdef USE_LOONGSON_MMI
    /* __m64 is double, we can compare directly. */
    return a == b;
#else
    return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
#endif
}

static force_inline pixman_bool_t
is_opaque (__m64 v)
{
#ifdef USE_LOONGSON_MMI
    return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
#else
    __m64 ffs = _mm_cmpeq_pi8 (v, v);
    /* Bit 6 of the movemask corresponds to byte 6, the low byte of the
     * alpha lane of an unpacked pixel, so it is set iff alpha == 0xff. */
    return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
#endif
}

static force_inline pixman_bool_t
is_zero (__m64 v)
{
    return is_equal (v, _mm_setzero_si64 ());
}

/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 *
 *    00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565 (__m64 pixel, int pos)
{
    __m64 p = pixel;
    __m64 t1, t2;

    /* move pixel to low 16 bit and zero the rest */
#ifdef USE_LOONGSON_MMI
    p = loongson_extract_pi16 (p, pos);
#else
    p = shift (shift (p, (3 - pos) * 16), -48);
#endif

    t1 = shift (p, 36 - 11);
    t2 = shift (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, MC (565_rgb));

    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
    return _mm_srli_pi16 (pixel, 8);
}
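
/* Added worked example (not part of upstream pixman) of the expansion
 * above, for pure red, pixel = 0xf800 at pos 0:
 *
 *   after the shifts and the 01f0003f001f mask, the fields occupy
 *   separate 16-bit lanes:   r = 0x01f0, g = 0x0000, b = 0x0000
 *   multiply (red lane):     0x01f0 * 0x0084 = 0xffc0
 *   shift right 8:           0x00ff
 *
 * so the 5-bit red value 0x1f expands to the 8-bit 0xff, matching the
 * usual (r << 3) | (r >> 2) replication. */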

/* Expand 4 16 bit pixels in an mmx register into two mmx registers of
 *
 *    AARRGGBBRRGGBB
 */
static force_inline void
expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
{
    __m64 t0, t1, alpha = _mm_setzero_si64 ();
    __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
    __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
    __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
    if (full_alpha)
        alpha = _mm_cmpeq_pi32 (alpha, alpha);

    /* Replicate high bits into empty low bits. */
    r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
    g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
    b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));

    r = _mm_packs_pu16 (r, _mm_setzero_si64 ()); /* 00 00 00 00 R3 R2 R1 R0 */
    g = _mm_packs_pu16 (g, _mm_setzero_si64 ()); /* 00 00 00 00 G3 G2 G1 G0 */
    b = _mm_packs_pu16 (b, _mm_setzero_si64 ()); /* 00 00 00 00 B3 B2 B1 B0 */

    t1 = _mm_unpacklo_pi8 (r, alpha);            /* A3 R3 A2 R2 A1 R1 A0 R0 */
    t0 = _mm_unpacklo_pi8 (b, g);                /* G3 B3 G2 B2 G1 B1 G0 B0 */

    *vout0 = _mm_unpacklo_pi16 (t0, t1);         /* A1 R1 G1 B1 A0 R0 G0 B0 */
    *vout1 = _mm_unpackhi_pi16 (t0, t1);         /* A3 R3 G3 B3 A2 R2 G2 B2 */
}

static force_inline __m64
expand8888 (__m64 in, int pos)
{
    if (pos == 0)
        return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
    else
        return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
}

static force_inline __m64
expandx888 (__m64 in, int pos)
{
    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
}

static force_inline void
expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
{
    __m64 v0, v1;
    expand_4xpacked565 (vin, &v0, &v1, full_alpha);
    *vout0 = expand8888 (v0, 0);
    *vout1 = expand8888 (v0, 1);
    *vout2 = expand8888 (v1, 0);
    *vout3 = expand8888 (v1, 1);
}

static force_inline __m64
pack_565 (__m64 pixel, __m64 target, int pos)
{
    __m64 p = pixel;
    __m64 t = target;
    __m64 r, g, b;

    r = _mm_and_si64 (p, MC (565_r));
    g = _mm_and_si64 (p, MC (565_g));
    b = _mm_and_si64 (p, MC (565_b));

#ifdef USE_LOONGSON_MMI
    r = shift (r, -(32 - 8));
    g = shift (g, -(16 - 3));
    b = shift (b, -(0 + 3));

    p = _mm_or_si64 (r, g);
    p = _mm_or_si64 (p, b);
    return loongson_insert_pi16 (t, p, pos);
#else
    r = shift (r, -(32 - 8) + pos * 16);
    g = shift (g, -(16 - 3) + pos * 16);
    b = shift (b, -(0 + 3) + pos * 16);

    if (pos == 0)
        t = _mm_and_si64 (t, MC (mask_0));
    else if (pos == 1)
        t = _mm_and_si64 (t, MC (mask_1));
    else if (pos == 2)
        t = _mm_and_si64 (t, MC (mask_2));
    else if (pos == 3)
        t = _mm_and_si64 (t, MC (mask_3));

    p = _mm_or_si64 (r, t);
    p = _mm_or_si64 (g, p);

    return _mm_or_si64 (b, p);
#endif
}

/* Pack the r and b channels of two 8888 pixels per register with a single
 * pmaddwd each: the multiplier places b's top five bits at bits 5-9 (* 4)
 * and r's top five bits at bits 16-20 (* 0x2000) of each 32-bit lane, so
 * that after OR-ing in g and shifting right by 5 each lane holds a packed
 * 565 pixel. */
static force_inline __m64
pack_4xpacked565 (__m64 a, __m64 b)
{
    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));

    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));

    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));

    t0 = _mm_or_si64 (t0, g0);
    t1 = _mm_or_si64 (t1, g1);

    t0 = shift(t0, -5);
#ifdef USE_ARM_IWMMXT
    t1 = shift(t1, -5);
    return _mm_packs_pu32 (t0, t1);
#else
    t1 = shift(t1, -5 + 16);
    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
#endif
}
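
/* Added scalar sketch (not part of upstream pixman), compiled out: what
 * the pmaddwd trick above computes for a single a8r8g8b8 pixel. */
#if 0
static uint16_t
pack_565_scalar (uint32_t p)
{
    uint32_t t;

    t  = (p & 0x000000f8) * 0x0004;          /* b top 5 bits -> bits 5..9   */
    t += ((p & 0x00f80000) >> 16) * 0x2000;  /* r top 5 bits -> bits 16..20 */
    t |= p & 0x0000fc00;                     /* g top 6 bits -> bits 10..15 */
    return (uint16_t) (t >> 5);              /* r:11..15  g:5..10  b:0..4   */
}
#endif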

#ifndef _MSC_VER

static force_inline __m64
pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
{
    return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
}

static force_inline __m64
pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
{
    x = pix_multiply (x, a);
    y = pix_multiply (y, b);

    return pix_add (x, y);
}

#else

/* MSVC only handles a "pass by register" of up to three SSE intrinsics */

#define pack_4x565(v0, v1, v2, v3) \
    pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))

#define pix_add_mul(x, a, y, b) \
    ( x = pix_multiply (x, a), \
      y = pix_multiply (y, b), \
      pix_add (x, y) )

#endif

/* --------------- MMX code patch for fbcompose.c --------------------- */

static force_inline __m64
combine (const uint32_t *src, const uint32_t *mask)
{
    __m64 vsrc = load8888 (src);

    if (mask)
    {
        __m64 m = load8888 (mask);

        m = expand_alpha (m);
        vsrc = pix_multiply (vsrc, m);
    }

    return vsrc;
}

static force_inline __m64
core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
{
    vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());

    if (is_opaque (vsrc))
    {
        return vsrc;
    }
    else if (!is_zero (vsrc))
    {
        return over (vsrc, expand_alpha (vsrc),
                     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
    }

    return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
}

static void
mmx_combine_over_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 vsrc = combine (src, mask);

        if (is_opaque (vsrc))
        {
            store8888 (dest, vsrc);
        }
        else if (!is_zero (vsrc))
        {
            __m64 sa = expand_alpha (vsrc);
            store8888 (dest, over (vsrc, sa, load8888 (dest)));
        }

        ++dest;
        ++src;
        if (mask)
            ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_over_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 d, da;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        da = expand_alpha (d);
        store8888 (dest, over (d, da, s));

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_in_u (pixman_implementation_t *imp,
                  pixman_op_t              op,
                  uint32_t *               dest,
                  const uint32_t *         src,
                  const uint32_t *         mask,
                  int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a;
        __m64 x = combine (src, mask);

        a = load8888 (dest);
        a = expand_alpha (a);
        x = pix_multiply (x, a);

        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          pixman_op_t              op,
                          uint32_t *               dest,
                          const uint32_t *         src,
                          const uint32_t *         mask,
                          int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a = combine (src, mask);
        __m64 x;

        x = load8888 (dest);
        a = expand_alpha (a);
        x = pix_multiply (x, a);
        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_out_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a;
        __m64 x = combine (src, mask);

        a = load8888 (dest);
        a = expand_alpha (a);
        a = negate (a);
        x = pix_multiply (x, a);
        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 a = combine (src, mask);
        __m64 x;

        x = load8888 (dest);
        a = expand_alpha (a);
        a = negate (a);
        x = pix_multiply (x, a);

        store8888 (dest, x);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 da, d, sia;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        sia = expand_alpha (s);
        sia = negate (sia);
        da = expand_alpha (d);
        s = pix_add_mul (s, da, d, sia);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end;

    end = dest + width;

    while (dest < end)
    {
        __m64 dia, d, sa;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        sa = expand_alpha (s);
        dia = expand_alpha (d);
        dia = negate (dia);
        s = pix_add_mul (s, dia, d, sa);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_xor_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 dia, d, sia;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        sia = expand_alpha (s);
        dia = expand_alpha (d);
        sia = negate (sia);
        dia = negate (dia);
        s = pix_add_mul (s, dia, d, sia);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_add_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        __m64 d;
        __m64 s = combine (src, mask);

        d = load8888 (dest);
        s = pix_add (s, d);
        store8888 (dest, s);

        ++dest;
        ++src;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_saturate_u (pixman_implementation_t *imp,
                        pixman_op_t              op,
                        uint32_t *               dest,
                        const uint32_t *         src,
                        const uint32_t *         mask,
                        int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
        uint32_t s, sa, da;
        uint32_t d = *dest;
        __m64 ms = combine (src, mask);
        __m64 md = load8888 (dest);

        store8888(&s, ms);
        da = ~d >> 24;
        sa = s >> 24;

        if (sa > da)
        {
            uint32_t quot = DIV_UN8 (da, sa) << 24;
            __m64 msa = load8888 (&quot);
            msa = expand_alpha (msa);
            ms = pix_multiply (ms, msa);
        }

        md = pix_add (md, ms);
        store8888 (dest, md);

        ++src;
        ++dest;
        if (mask)
            mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_src_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);

        s = pix_multiply (s, a);
        store8888 (dest, s);

        ++src;
        ++mask;
        ++dest;
    }
    _mm_empty ();
}

static void
mmx_combine_over_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        store8888 (dest, in_over (s, sa, a, d));

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        store8888 (dest, over (d, da, in (s, a)));

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_in_ca (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        s = pix_multiply (s, a);
        s = pix_multiply (s, da);
        store8888 (dest, s);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        a = pix_multiply (a, sa);
        d = pix_multiply (d, a);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_out_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);

        da = negate (da);
        s = pix_multiply (s, a);
        s = pix_multiply (s, da);
        store8888 (dest, s);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 sa = expand_alpha (s);

        a = pix_multiply (a, sa);
        a = negate (a);
        d = pix_multiply (d, a);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        a = negate (a);
        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        da = negate (da);
        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_xor_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);
        __m64 da = expand_alpha (d);
        __m64 sa = expand_alpha (s);

        s = pix_multiply (s, a);
        a = pix_multiply (a, sa);
        da = negate (da);
        a = negate (a);
        d = pix_add_mul (d, a, s, da);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_add_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
        __m64 a = load8888 (mask);
        __m64 s = load8888 (src);
        __m64 d = load8888 (dest);

        s = pix_multiply (s, a);
        d = pix_add (s, d);
        store8888 (dest, d);

        ++src;
        ++dest;
        ++mask;
    }
    _mm_empty ();
}

/* ------------- MMX code paths called from fbpict.c -------------------- */

static void
mmx_composite_over_n_8888 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        CHECKPOINT ();

        /* Handle leading pixels one at a time until dst is 8-byte aligned. */
        while (w && (uintptr_t)dst & 7)
        {
            store8888 (dst, over (vsrc, vsrca, load8888 (dst)));

            w--;
            dst++;
        }

        /* Composite two pixels per iteration over the aligned middle run. */
        while (w >= 2)
        {
            __m64 vdest;
            __m64 dest0, dest1;

            vdest = *(__m64 *)dst;

            dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
            dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));

            *(__m64 *)dst = pack8888 (dest0, dest1);

            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        /* At most one trailing pixel remains. */
        if (w)
        {
            store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_n_0565 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
            *dst = to_uint64 (vdest);

            w--;
            dst++;
        }

        while (w >= 4)
        {
            __m64 vdest = *(__m64 *)dst;
            __m64 v0, v1, v2, v3;

            expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

            v0 = over (vsrc, vsrca, v0);
            v1 = over (vsrc, vsrca, v1);
            v2 = over (vsrc, vsrca, v2);
            v3 = over (vsrc, vsrca, v3);

            *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);

            dst += 4;
            w -= 4;
        }

        CHECKPOINT ();

        while (w)
        {
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
            *dst = to_uint64 (vdest);

            w--;
            dst++;
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line;
    uint32_t *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        int twidth = width;
        uint32_t *p = (uint32_t *)mask_line;
        uint32_t *q = (uint32_t *)dst_line;

        while (twidth && (uintptr_t)q & 7)
        {
            uint32_t m = *(uint32_t *)p;

            if (m)
            {
                __m64 vdest = load8888 (q);
                vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
                store8888 (q, vdest);
            }

            twidth--;
            p++;
            q++;
        }

        while (twidth >= 2)
        {
            uint32_t m0, m1;
            m0 = *p;
            m1 = *(p + 1);

            if (m0 | m1)
            {
                __m64 dest0, dest1;
                __m64 vdest = *(__m64 *)q;

                dest0 = in_over (vsrc, vsrca, load8888 (&m0),
                                 expand8888 (vdest, 0));
                dest1 = in_over (vsrc, vsrca, load8888 (&m1),
                                 expand8888 (vdest, 1));

                *(__m64 *)q = pack8888 (dest0, dest1);
            }

            p += 2;
            q += 2;
            twidth -= 2;
        }

        if (twidth)
        {
            uint32_t m = *(uint32_t *)p;

            if (m)
            {
                __m64 vdest = load8888 (q);
                vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
                store8888 (q, vdest);
            }

            twidth--;
            p++;
            q++;
        }

        dst_line += dst_stride;
        mask_line += mask_stride;
    }

    _mm_empty ();
}

static void
mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
    vmask = expand_alpha (load8888 (&mask));

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, expand_alpha (s), vmask, d));

            w--;
            dst++;
            src++;
        }

        while (w >= 2)
        {
            __m64 vs = ldq_u ((__m64 *)src);
            __m64 vd = *(__m64 *)dst;
            __m64 vsrc0 = expand8888 (vs, 0);
            __m64 vsrc1 = expand8888 (vs, 1);

            *(__m64 *)dst = pack8888 (
                in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
                in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));

            w -= 2;
            dst += 2;
            src += 2;
        }

        if (w)
        {
            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
        }
    }

    _mm_empty ();
}

static void
mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;
    __m64 srca;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);

    vmask = expand_alpha (load8888 (&mask));
    srca = MC (4x00ff);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            uint32_t ssrc = *src | 0xff000000;
            __m64 s = load8888 (&ssrc);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, srca, vmask, d));

            w--;
            dst++;
            src++;
        }

        while (w >= 16)
        {
            __m64 vd0 = *(__m64 *)(dst + 0);
            __m64 vd1 = *(__m64 *)(dst + 2);
            __m64 vd2 = *(__m64 *)(dst + 4);
            __m64 vd3 = *(__m64 *)(dst + 6);
            __m64 vd4 = *(__m64 *)(dst + 8);
            __m64 vd5 = *(__m64 *)(dst + 10);
            __m64 vd6 = *(__m64 *)(dst + 12);
            __m64 vd7 = *(__m64 *)(dst + 14);

            __m64 vs0 = ldq_u ((__m64 *)(src + 0));
            __m64 vs1 = ldq_u ((__m64 *)(src + 2));
            __m64 vs2 = ldq_u ((__m64 *)(src + 4));
            __m64 vs3 = ldq_u ((__m64 *)(src + 6));
            __m64 vs4 = ldq_u ((__m64 *)(src + 8));
            __m64 vs5 = ldq_u ((__m64 *)(src + 10));
            __m64 vs6 = ldq_u ((__m64 *)(src + 12));
            __m64 vs7 = ldq_u ((__m64 *)(src + 14));

            vd0 = pack8888 (
                in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
                in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));

            vd1 = pack8888 (
                in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
                in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));

            vd2 = pack8888 (
                in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
                in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));

            vd3 = pack8888 (
                in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
                in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));

            vd4 = pack8888 (
                in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
                in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));

            vd5 = pack8888 (
                in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
                in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));

            vd6 = pack8888 (
                in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
                in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));

            vd7 = pack8888 (
                in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
                in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));

            *(__m64 *)(dst + 0) = vd0;
            *(__m64 *)(dst + 2) = vd1;
            *(__m64 *)(dst + 4) = vd2;
            *(__m64 *)(dst + 6) = vd3;
            *(__m64 *)(dst + 8) = vd4;
            *(__m64 *)(dst + 10) = vd5;
            *(__m64 *)(dst + 12) = vd6;
            *(__m64 *)(dst + 14) = vd7;

            w -= 16;
            dst += 16;
            src += 16;
        }

        while (w)
        {
            uint32_t ssrc = *src | 0xff000000;
            __m64 s = load8888 (&ssrc);
            __m64 d = load8888 (dst);

            store8888 (dst, in_over (s, srca, vmask, d));

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}

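/* OVER: a8r8g8b8 source, no mask, a8r8g8b8 destination.  A scalar
 * loop with two fast paths: fully opaque source pixels are copied
 * straight through, fully transparent ones leave dst untouched. */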
static void
mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t s;
    int dst_stride, src_stride;
    uint8_t a;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w--)
        {
            s = *src++;
            a = s >> 24;

            if (a == 0xff)
            {
                *dst = s;
            }
            else if (s)
            {
                __m64 ms, sa;
                ms = load8888 (&s);
                sa = expand_alpha (ms);
                store8888 (dst, over (ms, sa, load8888 (dst)));
            }

            dst++;
        }
    }
    _mm_empty ();
}

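/* OVER: a8r8g8b8 source, no mask, r5g6b5 destination.  Each 16-bit
 * destination pixel is widened to 8888 for the blend and packed back
 * to 565; the aligned loop handles four pixels per 64-bit word. */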
static void
mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            __m64 vsrc = load8888 (src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (
                over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            w--;
            dst++;
            src++;
        }

        CHECKPOINT ();

        while (w >= 4)
        {
            __m64 vdest = *(__m64 *)dst;
            __m64 v0, v1, v2, v3;
            __m64 vsrc0, vsrc1, vsrc2, vsrc3;

            expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

            vsrc0 = load8888 ((src + 0));
            vsrc1 = load8888 ((src + 1));
            vsrc2 = load8888 ((src + 2));
            vsrc3 = load8888 ((src + 3));

            v0 = over (vsrc0, expand_alpha (vsrc0), v0);
            v1 = over (vsrc1, expand_alpha (vsrc1), v1);
            v2 = over (vsrc2, expand_alpha (vsrc2), v2);
            v3 = over (vsrc3, expand_alpha (vsrc3), v3);

            *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);

            w -= 4;
            dst += 4;
            src += 4;
        }

        CHECKPOINT ();

        while (w)
        {
            __m64 vsrc = load8888 (src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}

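/* OVER: solid source, a8 mask, a8r8g8b8 destination.  srcsrc keeps
 * the solid colour replicated into both halves of a 64-bit word so a
 * pair of 0xff mask bytes under an opaque source becomes one
 * unconditional store; zero mask pairs are skipped entirely. */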
static void
mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca;
    uint64_t srcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
        return;

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = in_over (vsrc, vsrca,
                                       expand_alpha_rev (to_m64 (m)),
                                       load8888 (dst));

                store8888 (dst, vdest);
            }

            w--;
            mask++;
            dst++;
        }

        CHECKPOINT ();

        while (w >= 2)
        {
            uint64_t m0, m1;

            m0 = *mask;
            m1 = *(mask + 1);

            if (srca == 0xff && (m0 & m1) == 0xff)
            {
                *(uint64_t *)dst = srcsrc;
            }
            else if (m0 | m1)
            {
                __m64 vdest;
                __m64 dest0, dest1;

                vdest = *(__m64 *)dst;

                dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
                                 expand8888 (vdest, 0));
                dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
                                 expand8888 (vdest, 1));

                *(__m64 *)dst = pack8888 (dest0, dest1);
            }

            mask += 2;
            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        if (w)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = load8888 (dst);

                vdest = in_over (
                    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
                store8888 (dst, vdest);
            }
        }
    }

    _mm_empty ();
}

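/* Solid fill at 8, 16 or 32 bpp.  The filler is replicated into a
 * full 64-bit pattern, the destination pointer is aligned up with
 * narrowing stores, and the bulk is written 64 bytes per iteration
 * (held in eight MMX registers on GNU x86), with a scalar tail.
 *
 * Minimal usage sketch (hypothetical buffer, for illustration only):
 *
 *     uint32_t bits[64 * 64];
 *     mmx_fill (imp, bits, 64, 32, 0, 0, 64, 64, 0xffff0000);
 *
 * fills a 64x64 a8r8g8b8 surface with opaque red; as elsewhere in
 * pixman, the stride (64 here) is counted in uint32_t units. */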
static pixman_bool_t
mmx_fill (pixman_implementation_t *imp,
          uint32_t *               bits,
          int                      stride,
          int                      bpp,
          int                      x,
          int                      y,
          int                      width,
          int                      height,
          uint32_t                 filler)
{
    uint64_t fill;
    __m64 vfill;
    uint32_t byte_width;
    uint8_t *byte_line;

#if defined __GNUC__ && defined USE_X86_MMX
    __m64 v1, v2, v3, v4, v5, v6, v7;
#endif

    if (bpp != 16 && bpp != 32 && bpp != 8)
        return FALSE;

    if (bpp == 8)
    {
        stride = stride * (int) sizeof (uint32_t) / 1;
        byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
        byte_width = width;
        stride *= 1;
        filler = (filler & 0xff) * 0x01010101;
    }
    else if (bpp == 16)
    {
        stride = stride * (int) sizeof (uint32_t) / 2;
        byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
        byte_width = 2 * width;
        stride *= 2;
        filler = (filler & 0xffff) * 0x00010001;
    }
    else
    {
        stride = stride * (int) sizeof (uint32_t) / 4;
        byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
        byte_width = 4 * width;
        stride *= 4;
    }

    fill = ((uint64_t)filler << 32) | filler;
    vfill = to_m64 (fill);

#if defined __GNUC__ && defined USE_X86_MMX
    __asm__ (
        "movq %7, %0\n"
        "movq %7, %1\n"
        "movq %7, %2\n"
        "movq %7, %3\n"
        "movq %7, %4\n"
        "movq %7, %5\n"
        "movq %7, %6\n"
        : "=&y" (v1), "=&y" (v2), "=&y" (v3),
          "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
        : "y" (vfill));
#endif

    while (height--)
    {
        int w;
        uint8_t *d = byte_line;

        byte_line += stride;
        w = byte_width;

        if (w >= 1 && ((uintptr_t)d & 1))
        {
            *(uint8_t *)d = (filler & 0xff);
            w--;
            d++;
        }

        if (w >= 2 && ((uintptr_t)d & 3))
        {
            *(uint16_t *)d = filler;
            w -= 2;
            d += 2;
        }

        while (w >= 4 && ((uintptr_t)d & 7))
        {
            *(uint32_t *)d = filler;

            w -= 4;
            d += 4;
        }

        while (w >= 64)
        {
#if defined __GNUC__ && defined USE_X86_MMX
            __asm__ (
                "movq %1, (%0)\n"
                "movq %2, 8(%0)\n"
                "movq %3, 16(%0)\n"
                "movq %4, 24(%0)\n"
                "movq %5, 32(%0)\n"
                "movq %6, 40(%0)\n"
                "movq %7, 48(%0)\n"
                "movq %8, 56(%0)\n"
                :
                : "r" (d),
                  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
                  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
                : "memory");
#else
            *(__m64*) (d + 0) = vfill;
            *(__m64*) (d + 8) = vfill;
            *(__m64*) (d + 16) = vfill;
            *(__m64*) (d + 24) = vfill;
            *(__m64*) (d + 32) = vfill;
            *(__m64*) (d + 40) = vfill;
            *(__m64*) (d + 48) = vfill;
            *(__m64*) (d + 56) = vfill;
#endif
            w -= 64;
            d += 64;
        }

        while (w >= 4)
        {
            *(uint32_t *)d = filler;

            w -= 4;
            d += 4;
        }
        if (w >= 2)
        {
            *(uint16_t *)d = filler;
            w -= 2;
            d += 2;
        }
        if (w >= 1)
        {
            *(uint8_t *)d = (filler & 0xff);
            w--;
            d++;
        }
    }

    _mm_empty ();
    return TRUE;
}

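/* SRC: straight conversion of x8r8g8b8 to r5g6b5; the aligned loop
 * packs four pixels down to 565 at a time with pack_4xpacked565. */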
static void
mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            s = *src++;
            *dst = convert_8888_to_0565 (s);
            dst++;
            w--;
        }

        while (w >= 4)
        {
            __m64 vdest;
            __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
            __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));

            vdest = pack_4xpacked565 (vsrc0, vsrc1);

            *(__m64 *)dst = vdest;

            w -= 4;
            src += 4;
            dst += 4;
        }

        while (w)
        {
            s = *src++;
            *dst = convert_8888_to_0565 (s);
            dst++;
            w--;
        }
    }

    _mm_empty ();
}

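/* SRC: solid source, a8 mask, a8r8g8b8 destination.  SRC replaces
 * the destination outright, so a zero mask byte writes zero, and a
 * fully transparent source colour reduces the whole rectangle to an
 * mmx_fill with zero. */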
static void
mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
                            pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc;
    uint64_t srcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
    {
        mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
                  PIXMAN_FORMAT_BPP (dest_image->bits.format),
                  dest_x, dest_y, width, height, 0);
        return;
    }

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));

                store8888 (dst, vdest);
            }
            else
            {
                *dst = 0;
            }

            w--;
            mask++;
            dst++;
        }

        CHECKPOINT ();

        while (w >= 2)
        {
            uint64_t m0, m1;
            m0 = *mask;
            m1 = *(mask + 1);

            if (srca == 0xff && (m0 & m1) == 0xff)
            {
                *(uint64_t *)dst = srcsrc;
            }
            else if (m0 | m1)
            {
                __m64 dest0, dest1;

                dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
                dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));

                *(__m64 *)dst = pack8888 (dest0, dest1);
            }
            else
            {
                *(uint64_t *)dst = 0;
            }

            mask += 2;
            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        if (w)
        {
            uint64_t m = *mask;

            if (m)
            {
                __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));

                store8888 (dst, vdest);
            }
            else
            {
                *dst = 0;
            }
        }
    }

    _mm_empty ();
}

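/* OVER: solid source, a8 mask, r5g6b5 destination.  srcsrcsrcsrc is
 * the solid colour pre-packed to 565 and replicated into all four
 * 16-bit lanes, so four 0xff mask bytes under an opaque source
 * become a single 64-bit store. */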
static void
mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint16_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca, tmp;
    __m64 srcsrcsrcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
    srcsrcsrcsrc = expand_alpha_rev (tmp);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            uint64_t m = *mask;

            if (m)
            {
                uint64_t d = *dst;
                __m64 vd = to_m64 (d);
                __m64 vdest = in_over (
                    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));

                vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
                *dst = to_uint64 (vd);
            }

            w--;
            mask++;
            dst++;
        }

        CHECKPOINT ();

        while (w >= 4)
        {
            uint64_t m0, m1, m2, m3;
            m0 = *mask;
            m1 = *(mask + 1);
            m2 = *(mask + 2);
            m3 = *(mask + 3);

            if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
            {
                *(__m64 *)dst = srcsrcsrcsrc;
            }
            else if (m0 | m1 | m2 | m3)
            {
                __m64 vdest = *(__m64 *)dst;
                __m64 v0, v1, v2, v3;
                __m64 vm0, vm1, vm2, vm3;

                expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

                vm0 = to_m64 (m0);
                v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);

                vm1 = to_m64 (m1);
                v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);

                vm2 = to_m64 (m2);
                v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);

                vm3 = to_m64 (m3);
                v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);

                *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
            }

            w -= 4;
            mask += 4;
            dst += 4;
        }

        CHECKPOINT ();

        while (w)
        {
            uint64_t m = *mask;

            if (m)
            {
                uint64_t d = *dst;
                __m64 vd = to_m64 (d);
                __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
                                       expand565 (vd, 0));
                vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
                *dst = to_uint64 (vd);
            }

            w--;
            mask++;
            dst++;
        }
    }

    _mm_empty ();
}

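/* OVER: non-premultiplied pixbuf-order source onto an r5g6b5
 * destination.  over_rev_non_pre premultiplies and channel-swaps
 * each source pixel on the fly; a run of four fully opaque pixels
 * skips the blend and is channel-swapped and packed directly. */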
static void
mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            __m64 vsrc = load8888 (src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            w--;
            dst++;
            src++;
        }

        CHECKPOINT ();

        while (w >= 4)
        {
            uint32_t s0, s1, s2, s3;
            unsigned char a0, a1, a2, a3;

            s0 = *src;
            s1 = *(src + 1);
            s2 = *(src + 2);
            s3 = *(src + 3);

            a0 = (s0 >> 24);
            a1 = (s1 >> 24);
            a2 = (s2 >> 24);
            a3 = (s3 >> 24);

            if ((a0 & a1 & a2 & a3) == 0xFF)
            {
                __m64 v0 = invert_colors (load8888 (&s0));
                __m64 v1 = invert_colors (load8888 (&s1));
                __m64 v2 = invert_colors (load8888 (&s2));
                __m64 v3 = invert_colors (load8888 (&s3));

                *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
            }
            else if (s0 | s1 | s2 | s3)
            {
                __m64 vdest = *(__m64 *)dst;
                __m64 v0, v1, v2, v3;

                __m64 vsrc0 = load8888 (&s0);
                __m64 vsrc1 = load8888 (&s1);
                __m64 vsrc2 = load8888 (&s2);
                __m64 vsrc3 = load8888 (&s3);

                expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

                v0 = over_rev_non_pre (vsrc0, v0);
                v1 = over_rev_non_pre (vsrc1, v1);
                v2 = over_rev_non_pre (vsrc2, v2);
                v3 = over_rev_non_pre (vsrc3, v3);

                *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
            }

            w -= 4;
            dst += 4;
            src += 4;
        }

        CHECKPOINT ();

        while (w)
        {
            __m64 vsrc = load8888 (src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}

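/* OVER: non-premultiplied pixbuf-order source onto an a8r8g8b8
 * destination; the same conversion as the 0565 variant, two pixels
 * per 64-bit store in the aligned loop. */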
static void
mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, over_rev_non_pre (s, d));

            w--;
            dst++;
            src++;
        }

        while (w >= 2)
        {
            uint32_t s0, s1;
            unsigned char a0, a1;
            __m64 d0, d1;

            s0 = *src;
            s1 = *(src + 1);

            a0 = (s0 >> 24);
            a1 = (s1 >> 24);

            if ((a0 & a1) == 0xFF)
            {
                d0 = invert_colors (load8888 (&s0));
                d1 = invert_colors (load8888 (&s1));

                *(__m64 *)dst = pack8888 (d0, d1);
            }
            else if (s0 | s1)
            {
                __m64 vdest = *(__m64 *)dst;

                d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
                d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));

                *(__m64 *)dst = pack8888 (d0, d1);
            }

            w -= 2;
            dst += 2;
            src += 2;
        }

        if (w)
        {
            __m64 s = load8888 (src);
            __m64 d = load8888 (dst);

            store8888 (dst, over_rev_non_pre (s, d));
        }
    }

    _mm_empty ();
}

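/* OVER with component alpha: solid source, a8r8g8b8 per-channel
 * mask, r5g6b5 destination.  Each 32-bit mask pixel weights the
 * source channels independently; the aligned loop expands, blends
 * and repacks four 565 pixels per iteration. */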
static void
mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t *dst_line;
    uint32_t *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        int twidth = width;
        uint32_t *p = (uint32_t *)mask_line;
        uint16_t *q = (uint16_t *)dst_line;

        while (twidth && ((uintptr_t)q & 7))
        {
            uint32_t m = *(uint32_t *)p;

            if (m)
            {
                uint64_t d = *q;
                __m64 vdest = expand565 (to_m64 (d), 0);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
                *q = to_uint64 (vdest);
            }

            twidth--;
            p++;
            q++;
        }

        while (twidth >= 4)
        {
            uint32_t m0, m1, m2, m3;

            m0 = *p;
            m1 = *(p + 1);
            m2 = *(p + 2);
            m3 = *(p + 3);

            if ((m0 | m1 | m2 | m3))
            {
                __m64 vdest = *(__m64 *)q;
                __m64 v0, v1, v2, v3;

                expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

                v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
                v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
                v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
                v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);

                *(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
            }
            twidth -= 4;
            p += 4;
            q += 4;
        }

        while (twidth)
        {
            uint32_t m;

            m = *(uint32_t *)p;
            if (m)
            {
                uint64_t d = *q;
                __m64 vdest = expand565 (to_m64 (d), 0);
                vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
                *q = to_uint64 (vdest);
            }

            twidth--;
            p++;
            q++;
        }

        mask_line += mask_stride;
        dst_line += dst_stride;
    }

    _mm_empty ();
}

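/* IN: solid source, a8 mask, a8 destination.  Every destination byte
 * is scaled by mask * source alpha; the vector loop treats four
 * mask/destination bytes as one 32-bit quantity. */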
static void
mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
                        pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint8_t sa;
    __m64 vsrc, vsrca;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    sa = src >> 24;

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            uint16_t tmp;
            uint8_t a;
            uint32_t m, d;

            a = *mask++;
            d = *dst;

            m = MUL_UN8 (sa, a, tmp);
            d = MUL_UN8 (m, d, tmp);

            *dst++ = d;
            w--;
        }

        while (w >= 4)
        {
            __m64 vmask;
            __m64 vdest;

            vmask = load8888u ((uint32_t *)mask);
            vdest = load8888 ((uint32_t *)dst);

            store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));

            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w--)
        {
            uint16_t tmp;
            uint8_t a;
            uint32_t m, d;

            a = *mask++;
            d = *dst;

            m = MUL_UN8 (sa, a, tmp);
            d = MUL_UN8 (m, d, tmp);

            *dst++ = d;
        }
    }

    _mm_empty ();
}

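/* IN: a8 source onto an a8 destination, a per-byte multiply
 * vectorised four bytes at a time. */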
static void
mmx_composite_in_8_8 (pixman_implementation_t *imp,
                      pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int src_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 3)
        {
            uint8_t s, d;
            uint16_t tmp;

            s = *src;
            d = *dst;

            *dst = MUL_UN8 (s, d, tmp);

            src++;
            dst++;
            w--;
        }

        while (w >= 4)
        {
            uint32_t *s = (uint32_t *)src;
            uint32_t *d = (uint32_t *)dst;

            store8888 (d, in (load8888u (s), load8888 (d)));

            w -= 4;
            dst += 4;
            src += 4;
        }

        while (w--)
        {
            uint8_t s, d;
            uint16_t tmp;

            s = *src;
            d = *dst;

            *dst = MUL_UN8 (s, d, tmp);

            src++;
            dst++;
        }
    }

    _mm_empty ();
}

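/* ADD: solid source, a8 mask, a8 destination.  mask * source alpha
 * is added to each destination byte with unsigned saturation. */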
static void
mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint8_t sa;
    __m64 vsrc, vsrca;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    sa = src >> 24;

    if (src == 0)
        return;

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        while (w && (uintptr_t)dst & 3)
        {
            uint16_t tmp;
            uint16_t a;
            uint32_t m, d;
            uint32_t r;

            a = *mask++;
            d = *dst;

            m = MUL_UN8 (sa, a, tmp);
            r = ADD_UN8 (m, d, tmp);

            *dst++ = r;
            w--;
        }

        while (w >= 4)
        {
            __m64 vmask;
            __m64 vdest;

            vmask = load8888u ((uint32_t *)mask);
            vdest = load8888 ((uint32_t *)dst);

            store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));

            dst += 4;
            mask += 4;
            w -= 4;
        }

        while (w--)
        {
            uint16_t tmp;
            uint16_t a;
            uint32_t m, d;
            uint32_t r;

            a = *mask++;
            d = *dst;

            m = MUL_UN8 (sa, a, tmp);
            r = ADD_UN8 (m, d, tmp);

            *dst++ = r;
        }
    }

    _mm_empty ();
}

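/* ADD: a8 source onto an a8 destination with saturation.  The scalar
 * edges use the branchless trick t = d + s; s = t | (0 - (t >> 8));
 * which forces the byte to 0xff whenever the sum carries into the
 * ninth bit; the aligned loop is _mm_adds_pu8 on eight bytes. */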
static void
mmx_composite_add_8_8 (pixman_implementation_t *imp,
                       pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint8_t s, d;
    uint16_t t;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            s = *src;
            d = *dst;
            t = d + s;
            s = t | (0 - (t >> 8));
            *dst = s;

            dst++;
            src++;
            w--;
        }

        while (w >= 8)
        {
            *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
            dst += 8;
            src += 8;
            w -= 8;
        }

        while (w)
        {
            s = *src;
            d = *dst;
            t = d + s;
            s = t | (0 - (t >> 8));
            *dst = s;

            dst++;
            src++;
            w--;
        }
    }

    _mm_empty ();
}

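/* ADD: r5g6b5 source and destination.  Pixels are widened to 8888,
 * added with per-channel saturation, and packed back to 565, four at
 * a time in the aligned loop. */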
static void
mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t d;
    uint16_t *src_line, *src;
    uint32_t s;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            s = *src++;
            if (s)
            {
                d = *dst;
                s = convert_0565_to_8888 (s);
                if (d)
                {
                    d = convert_0565_to_8888 (d);
                    UN8x4_ADD_UN8x4 (s, d);
                }
                *dst = convert_8888_to_0565 (s);
            }
            dst++;
            w--;
        }

        while (w >= 4)
        {
            __m64 vdest = *(__m64 *)dst;
            __m64 vsrc = ldq_u ((__m64 *)src);
            __m64 vd0, vd1;
            __m64 vs0, vs1;

            expand_4xpacked565 (vdest, &vd0, &vd1, 0);
            expand_4xpacked565 (vsrc, &vs0, &vs1, 0);

            vd0 = _mm_adds_pu8 (vd0, vs0);
            vd1 = _mm_adds_pu8 (vd1, vs1);

            *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);

            dst += 4;
            src += 4;
            w -= 4;
        }

        while (w--)
        {
            s = *src++;
            if (s)
            {
                d = *dst;
                s = convert_0565_to_8888 (s);
                if (d)
                {
                    d = convert_0565_to_8888 (d);
                    UN8x4_ADD_UN8x4 (s, d);
                }
                *dst = convert_8888_to_0565 (s);
            }
            dst++;
        }
    }

    _mm_empty ();
}

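/* ADD: a8r8g8b8 source and destination, saturating adds on two
 * pixels per 64-bit step with single-pixel loads at the edges. */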
static void
mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        while (w && (uintptr_t)dst & 7)
        {
            store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
                                      load ((const uint32_t *)dst)));
            dst++;
            src++;
            w--;
        }

        while (w >= 2)
        {
            *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
            dst += 2;
            src += 2;
            w -= 2;
        }

        if (w)
        {
            store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
                                      load ((const uint32_t *)dst)));
        }
    }

    _mm_empty ();
}

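/* Rectangle copy for 16 and 32 bpp images.  The destination pointer
 * is aligned up to 8 bytes, the bulk moves 64 bytes per iteration
 * (staged through the eight MMX registers on GNU x86), and FALSE is
 * returned for unsupported depths so the caller can fall back. */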
static pixman_bool_t
mmx_blt (pixman_implementation_t *imp,
         uint32_t *               src_bits,
         uint32_t *               dst_bits,
         int                      src_stride,
         int                      dst_stride,
         int                      src_bpp,
         int                      dst_bpp,
         int                      src_x,
         int                      src_y,
         int                      dest_x,
         int                      dest_y,
         int                      width,
         int                      height)
{
    uint8_t *src_bytes;
    uint8_t *dst_bytes;
    int byte_width;

    if (src_bpp != dst_bpp)
        return FALSE;

    if (src_bpp == 16)
    {
        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
        src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
        byte_width = 2 * width;
        src_stride *= 2;
        dst_stride *= 2;
    }
    else if (src_bpp == 32)
    {
        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
        byte_width = 4 * width;
        src_stride *= 4;
        dst_stride *= 4;
    }
    else
    {
        return FALSE;
    }

    while (height--)
    {
        int w;
        uint8_t *s = src_bytes;
        uint8_t *d = dst_bytes;
        src_bytes += src_stride;
        dst_bytes += dst_stride;
        w = byte_width;

        if (w >= 1 && ((uintptr_t)d & 1))
        {
            *(uint8_t *)d = *(uint8_t *)s;
            w -= 1;
            s += 1;
            d += 1;
        }

        if (w >= 2 && ((uintptr_t)d & 3))
        {
            *(uint16_t *)d = *(uint16_t *)s;
            w -= 2;
            s += 2;
            d += 2;
        }

        while (w >= 4 && ((uintptr_t)d & 7))
        {
            *(uint32_t *)d = ldl_u ((uint32_t *)s);

            w -= 4;
            s += 4;
            d += 4;
        }

        while (w >= 64)
        {
#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
            __asm__ (
                "movq (%1), %%mm0\n"
                "movq 8(%1), %%mm1\n"
                "movq 16(%1), %%mm2\n"
                "movq 24(%1), %%mm3\n"
                "movq 32(%1), %%mm4\n"
                "movq 40(%1), %%mm5\n"
                "movq 48(%1), %%mm6\n"
                "movq 56(%1), %%mm7\n"

                "movq %%mm0, (%0)\n"
                "movq %%mm1, 8(%0)\n"
                "movq %%mm2, 16(%0)\n"
                "movq %%mm3, 24(%0)\n"
                "movq %%mm4, 32(%0)\n"
                "movq %%mm5, 40(%0)\n"
                "movq %%mm6, 48(%0)\n"
                "movq %%mm7, 56(%0)\n"
                :
                : "r" (d), "r" (s)
                : "memory",
                  "%mm0", "%mm1", "%mm2", "%mm3",
                  "%mm4", "%mm5", "%mm6", "%mm7");
#else
            __m64 v0 = ldq_u ((__m64 *)(s + 0));
            __m64 v1 = ldq_u ((__m64 *)(s + 8));
            __m64 v2 = ldq_u ((__m64 *)(s + 16));
            __m64 v3 = ldq_u ((__m64 *)(s + 24));
            __m64 v4 = ldq_u ((__m64 *)(s + 32));
            __m64 v5 = ldq_u ((__m64 *)(s + 40));
            __m64 v6 = ldq_u ((__m64 *)(s + 48));
            __m64 v7 = ldq_u ((__m64 *)(s + 56));
            *(__m64 *)(d + 0) = v0;
            *(__m64 *)(d + 8) = v1;
            *(__m64 *)(d + 16) = v2;
            *(__m64 *)(d + 24) = v3;
            *(__m64 *)(d + 32) = v4;
            *(__m64 *)(d + 40) = v5;
            *(__m64 *)(d + 48) = v6;
            *(__m64 *)(d + 56) = v7;
#endif

            w -= 64;
            s += 64;
            d += 64;
        }
        while (w >= 4)
        {
            *(uint32_t *)d = ldl_u ((uint32_t *)s);

            w -= 4;
            s += 4;
            d += 4;
        }
        if (w >= 2)
        {
            *(uint16_t *)d = *(uint16_t *)s;
            w -= 2;
            s += 2;
            d += 2;
        }
    }

    _mm_empty ();

    return TRUE;
}

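/* SRC copy between images of equal depth, expressed as an mmx_blt. */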
static void
mmx_composite_copy_area (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);

    mmx_blt (imp, src_image->bits.bits,
             dest_image->bits.bits,
             src_image->bits.rowstride,
             dest_image->bits.rowstride,
             PIXMAN_FORMAT_BPP (src_image->bits.format),
             PIXMAN_FORMAT_BPP (dest_image->bits.format),
             src_x, src_y, dest_x, dest_y, width, height);
}

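/* OVER: x8r8g8b8 source forced opaque, a8 mask, a8r8g8b8
 * destination.  Scalar only: a zero mask byte skips the pixel, 0xff
 * stores the opaque source directly, and anything in between takes
 * the full in_over path. */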
static void
mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *src, *src_line;
    uint32_t *dst, *dst_line;
    uint8_t *mask, *mask_line;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        while (w--)
        {
            uint64_t m = *mask;

            if (m)
            {
                uint32_t ssrc = *src | 0xff000000;
                __m64 s = load8888 (&ssrc);

                if (m == 0xff)
                {
                    store8888 (dst, s);
                }
                else
                {
                    __m64 sa = expand_alpha (s);
                    __m64 vm = expand_alpha_rev (to_m64 (m));
                    __m64 vdest = in_over (s, sa, vm, load8888 (dst));

                    store8888 (dst, vdest);
                }
            }

            mask++;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}

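/* OVER_REVERSE: the a8r8g8b8 destination is composited over the
 * solid source colour, so the source only shows through where the
 * destination is not already opaque. */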
static void
mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        w = width;

        CHECKPOINT ();

        while (w && (uintptr_t)dst & 7)
        {
            __m64 vdest = load8888 (dst);

            store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));

            w--;
            dst++;
        }

        while (w >= 2)
        {
            __m64 vdest = *(__m64 *)dst;
            __m64 dest0 = expand8888 (vdest, 0);
            __m64 dest1 = expand8888 (vdest, 1);

            dest0 = over (dest0, expand_alpha (dest0), vsrc);
            dest1 = over (dest1, expand_alpha (dest1), vsrc);

            *(__m64 *)dst = pack8888 (dest0, dest1);

            dst += 2;
            w -= 2;
        }

        CHECKPOINT ();

        if (w)
        {
            __m64 vdest = load8888 (dst);

            store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
        }
    }

    _mm_empty ();
}

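/*
 * Bilinear interpolation with BILINEAR_INTERPOLATION_BITS of fixed-point
 * precision.  The vertical weights wt and wb and the horizontal weights
 * wl = BSHIFT - wx and wr = wx each sum to BSHIFT, so a channel of the
 * result is
 *
 *     ((tl * wt + bl * wb) * wl + (tr * wt + br * wb) * wr)
 *         >> (2 * BILINEAR_INTERPOLATION_BITS)
 *
 * where tl/tr/bl/br stand for the four neighbouring source pixels
 * (names used here only for illustration).  For example, with
 * BILINEAR_INTERPOLATION_BITS = 7 (BSHIFT = 128), wt = 96, wb = 32 and
 * wx = 48 -- a sample 1/4 of the way down and 3/8 of the way across the
 * 2x2 block -- a channel becomes
 * ((tl*96 + bl*32)*80 + (tr*96 + br*32)*48) >> 14.
 *
 * Below 8 bits of precision the vertically filtered values fit in
 * signed 16-bit lanes, so _mm_madd_pi16 performs both horizontal
 * multiplies and the pairwise add in one step; mm_addc7 and mm_xorc7
 * produce BSHIFT - wx in alternate lanes as (wx ^ BMSK) + 1.  At 8 bits
 * the intermediates exceed that range, hence the separate
 * mullo/mulhi 32-bit path in the macro.
 */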
#define BSHIFT (1 << BILINEAR_INTERPOLATION_BITS)
#define BMSK (BSHIFT - 1)

#define BILINEAR_DECLARE_VARIABLES                                         \
    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);                     \
    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);                     \
    const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT); \
    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);                      \
    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);                \
    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);     \
    const __m64 mm_zero = _mm_setzero_si64 ();                             \
    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)

#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)                                \
do {                                                                       \
    /* fetch 2x2 pixel block into 2 mmx registers */                       \
    __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);        \
    __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);     \
    /* vertical interpolation */                                           \
    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);    \
    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);    \
    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);    \
    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);    \
    __m64 hi = _mm_add_pi16 (t_hi, b_hi);                                  \
    __m64 lo = _mm_add_pi16 (t_lo, b_lo);                                  \
    vx += unit_x;                                                          \
    if (BILINEAR_INTERPOLATION_BITS < 8)                                   \
    {                                                                      \
        /* calculate horizontal weights */                                 \
        __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,      \
                          _mm_srli_pi16 (mm_x,                             \
                              16 - BILINEAR_INTERPOLATION_BITS)));         \
        /* horizontal interpolation */                                     \
        __m64 p = _mm_unpacklo_pi16 (lo, hi);                              \
        __m64 q = _mm_unpackhi_pi16 (lo, hi);                              \
        lo = _mm_madd_pi16 (p, mm_wh);                                     \
        hi = _mm_madd_pi16 (q, mm_wh);                                     \
    }                                                                      \
    else                                                                   \
    {                                                                      \
        /* calculate horizontal weights */                                 \
        __m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x,     \
                             16 - BILINEAR_INTERPOLATION_BITS));           \
        __m64 mm_wh_hi = _mm_srli_pi16 (mm_x,                              \
                             16 - BILINEAR_INTERPOLATION_BITS);            \
        /* horizontal interpolation */                                     \
        __m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo);                    \
        __m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi);                    \
        __m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo);                    \
        __m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi);                    \
        lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo),         \
                           _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi));        \
        hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo),         \
                           _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi));        \
    }                                                                      \
    mm_x = _mm_add_pi16 (mm_x, mm_ux);                                     \
    /* shift and pack the result */                                        \
    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);              \
    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);              \
    lo = _mm_packs_pi32 (lo, hi);                                          \
    lo = _mm_packs_pu16 (lo, lo);                                          \
    pix = lo;                                                              \
} while (0)

#define BILINEAR_SKIP_ONE_PIXEL()                                          \
do {                                                                       \
    vx += unit_x;                                                          \
    mm_x = _mm_add_pi16 (mm_x, mm_ux);                                     \
} while (0)

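/*
 * Scanline worker for bilinearly scaled SRC: every destination pixel is
 * interpolated from its 2x2 source block and stored unconditionally.
 * The FAST_BILINEAR_MAINLOOP_COMMON instantiations below generate the
 * surrounding mainloops for the COVER, PAD, NONE and NORMAL repeat
 * modes.
 */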
static force_inline void
scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
                                            const uint32_t * mask,
                                            const uint32_t * src_top,
                                            const uint32_t * src_bottom,
                                            int32_t          w,
                                            int              wt,
                                            int              wb,
                                            pixman_fixed_t   vx,
                                            pixman_fixed_t   unit_x,
                                            pixman_fixed_t   max_vx,
                                            pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix;

    while (w--)
    {
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
        store (dst, pix);
        dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
                               scaled_bilinear_scanline_mmx_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
                               scaled_bilinear_scanline_mmx_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
                               scaled_bilinear_scanline_mmx_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
                               scaled_bilinear_scanline_mmx_8888_8888_SRC,
                               uint32_t, uint32_t, uint32_t,
                               NORMAL, FLAG_NONE)

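/*
 * The same scanline shape for the OVER operator: fully transparent
 * interpolated pixels are skipped, everything else is combined with
 * the destination through core_combine_over_u_pixel_mmx ().
 */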
static force_inline void
scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
                                             const uint32_t * mask,
                                             const uint32_t * src_top,
                                             const uint32_t * src_bottom,
                                             int32_t          w,
                                             int              wt,
                                             int              wb,
                                             pixman_fixed_t   vx,
                                             pixman_fixed_t   unit_x,
                                             pixman_fixed_t   max_vx,
                                             pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix1, pix2;

    while (w)
    {
        BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

        if (!is_zero (pix1))
        {
            pix2 = load (dst);
            store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
        }

        w--;
        dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
                               scaled_bilinear_scanline_mmx_8888_8888_OVER,
                               uint32_t, uint32_t, uint32_t,
                               COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
                               scaled_bilinear_scanline_mmx_8888_8888_OVER,
                               uint32_t, uint32_t, uint32_t,
                               PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
                               scaled_bilinear_scanline_mmx_8888_8888_OVER,
                               uint32_t, uint32_t, uint32_t,
                               NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
                               scaled_bilinear_scanline_mmx_8888_8888_OVER,
                               uint32_t, uint32_t, uint32_t,
                               NORMAL, FLAG_NONE)

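/*
 * Bilinear OVER with an additional a8 mask.  A zero mask byte skips the
 * interpolation work entirely (only the coordinate is advanced); a 0xff
 * mask byte over an opaque interpolated pixel stores directly; every
 * other case expands the mask and goes through in_over ().
 */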
static force_inline void
scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
                                               const uint8_t *  mask,
                                               const uint32_t * src_top,
                                               const uint32_t * src_bottom,
                                               int32_t          w,
                                               int              wt,
                                               int              wb,
                                               pixman_fixed_t   vx,
                                               pixman_fixed_t   unit_x,
                                               pixman_fixed_t   max_vx,
                                               pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix1, pix2;
    uint32_t m;

    while (w)
    {
        m = (uint32_t) *mask++;

        if (m)
        {
            BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

            if (m == 0xff && is_opaque (pix1))
            {
                store (dst, pix1);
            }
            else
            {
                __m64 ms, md, ma, msa;

                pix2 = load (dst);
                ma = expand_alpha_rev (to_m64 (m));
                ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
                md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());

                msa = expand_alpha (ms);

                store8888 (dst, (in_over (ms, msa, ma, md)));
            }
        }
        else
        {
            BILINEAR_SKIP_ONE_PIXEL ();
        }

        w--;
        dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
                               scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
                               uint32_t, uint8_t, uint32_t,
                               COVER, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
                               scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
                               uint32_t, uint8_t, uint32_t,
                               PAD, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
                               scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
                               uint32_t, uint8_t, uint32_t,
                               NONE, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
                               scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
                               uint32_t, uint8_t, uint32_t,
                               NORMAL, FLAG_HAVE_NON_SOLID_MASK)

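/*
 * Source-iterator scanline fetchers.  Each converts one scanline of the
 * underlying format into the 32-bit a8r8g8b8 intermediate buffer that
 * narrow iteration expects, advancing iter->bits to the next row as a
 * side effect.  The leading per-pixel loops only bring dst up to an
 * alignment suitable for the __m64 stores in the main loop.
 */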
static uint32_t *
mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 7)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    while (w >= 8)
    {
        __m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
        __m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
        __m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
        __m64 vsrc4 = ldq_u ((__m64 *)(src + 6));

        *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
        *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
        *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
        *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));

        dst += 8;
        src += 8;
        w -= 8;
    }

    while (w)
    {
        *dst++ = (*src++) | 0xff000000;
        w--;
    }

    _mm_empty ();
    return iter->buffer;
}

static uint32_t *
mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;

    iter->bits += iter->stride;

    while (w && ((uintptr_t)dst) & 0x0f)
    {
        uint16_t s = *src++;

        *dst++ = convert_0565_to_8888 (s);
        w--;
    }

    while (w >= 4)
    {
        __m64 vsrc = ldq_u ((__m64 *)src);
        __m64 mm0, mm1;

        expand_4xpacked565 (vsrc, &mm0, &mm1, 1);

        *(__m64 *)(dst + 0) = mm0;
        *(__m64 *)(dst + 2) = mm1;

        dst += 4;
        src += 4;
        w -= 4;
    }

    while (w)
    {
        uint16_t s = *src++;

        *dst++ = convert_0565_to_8888 (s);
        w--;
    }

    _mm_empty ();
    return iter->buffer;
}

static uint32_t *
mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;

    iter->bits += iter->stride;

    while (w && (((uintptr_t)dst) & 15))
    {
        *dst++ = *(src++) << 24;
        w--;
    }

    while (w >= 8)
    {
        __m64 mm0 = ldq_u ((__m64 *)src);

        __m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64 (), mm0);
        __m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64 (), mm0);
        __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64 (), mm1);
        __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64 (), mm1);
        __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64 (), mm2);
        __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64 (), mm2);

        *(__m64 *)(dst + 0) = mm3;
        *(__m64 *)(dst + 2) = mm4;
        *(__m64 *)(dst + 4) = mm5;
        *(__m64 *)(dst + 6) = mm6;

        dst += 8;
        src += 8;
        w -= 8;
    }

    while (w)
    {
        *dst++ = *(src++) << 24;
        w--;
    }

    _mm_empty ();
    return iter->buffer;
}

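/*
 * Dispatch table mapping image formats to the fetchers above,
 * terminated by PIXMAN_null.  mmx_src_iter_init () walks it and, when
 * the image qualifies (narrow iteration over an untransformed bits
 * image whose samples cover the clip), points iter->bits at the first
 * requested pixel and installs the matching get_scanline callback.
 * Returning FALSE leaves the iterator to the fallback implementation.
 */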
typedef struct
{
    pixman_format_code_t format;
    pixman_iter_get_scanline_t get_scanline;
} fetcher_info_t;

static const fetcher_info_t fetchers[] =
{
    { PIXMAN_x8r8g8b8, mmx_fetch_x8r8g8b8 },
    { PIXMAN_r5g6b5, mmx_fetch_r5g6b5 },
    { PIXMAN_a8, mmx_fetch_a8 },
    { PIXMAN_null }
};

static pixman_bool_t
mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
{
    pixman_image_t *image = iter->image;

#define FLAGS                                                           \
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |                \
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

    if ((iter->iter_flags & ITER_NARROW) &&
        (iter->image_flags & FLAGS) == FLAGS)
    {
        const fetcher_info_t *f;

        for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
        {
            if (image->common.extended_format_code == f->format)
            {
                uint8_t *b = (uint8_t *)image->bits.bits;
                int s = image->bits.rowstride * 4;

                iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
                iter->stride = s;

                iter->get_scanline = f->get_scanline;
                return TRUE;
            }
        }
    }

    return FALSE;
}

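/*
 * Fast-path table consulted by the generic compositing code: each entry
 * matches an (operator, source, mask, destination) format combination
 * plus the associated flags and names the MMX routine that handles it.
 * The PIXMAN_OP_NONE entry terminates the table.
 */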
static const pixman_fast_path_t mmx_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),

    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),

    PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8             ),
    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),

    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),

    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888),

    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888),

    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888),

    { PIXMAN_OP_NONE },
};

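/*
 * Create the MMX implementation.  Anything without an entry in
 * mmx_fast_paths or a combiner below is delegated to the fallback
 * implementation passed in by pixman's implementation chain; this file
 * contributes the unified and component-alpha combiners, blt, fill and
 * the source-iterator initializer.
 */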
pixman_implementation_t *
_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);

    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;

    imp->blt = mmx_blt;
    imp->fill = mmx_fill;

    imp->src_iter_init = mmx_src_iter_init;

    return imp;
}

#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */
