gfx/cairo/libpixman/src/pixman-sse2.c

author       Michael Schloh von Bennewitz <michael@schloh.com>
date         Thu, 22 Jan 2015 13:21:57 +0100
branch       TOR_BUG_9701
changeset    15:b8a032363ba2
permissions  -rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission. Red Hat makes no representations about the
 * suitability of this software for any purpose. It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

static __m128i mask_565_rb;
static __m128i mask_565_pack_multiplier;

static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

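/* Expand four r5g6b5 pixels to x8r8g8b8: each channel is shifted into
 * byte position and its top bits are replicated into the freshly opened
 * low bits, so that e.g. 0x1f widens to 0xff rather than 0xf8.  The
 * mask_* constants used here are initialized elsewhere in this file,
 * during implementation setup.
 */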
static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t = _mm_and_si128 (rb, mask_565_fix_rb);
    t = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t = _mm_and_si128 (g, mask_565_fix_g);
    t = _mm_srli_epi32 (t, 6);
    g = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}

static force_inline void
unpack_565_128_4x128 (__m128i  data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack_565_2packedx128_128 (__m128i lo, __m128i hi)
{
    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);

    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);

    __m128i g0 = _mm_and_si128 (lo, mask_green);
    __m128i g1 = _mm_and_si128 (hi, mask_green);

    t0 = _mm_or_si128 (t0, g0);
    t1 = _mm_or_si128 (t1, g1);

    /* Simulates _mm_packus_epi32 */
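    /* (_mm_packus_epi32 is an SSE4.1 instruction; plain SSE2 obtains the
     * same unsigned saturating pack by shifting the 32-bit lanes into
     * signed 16-bit range and then using the signed pack below.)
     */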
    t0 = _mm_slli_epi32 (t0, 16 - 5);
    t1 = _mm_slli_epi32 (t1, 16 - 5);
    t0 = _mm_srai_epi32 (t0, 16);
    t1 = _mm_srai_epi32 (t1, 16);
    return _mm_packs_epi32 (t0, t1);
}

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r  = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

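/* The next three predicates test four a8r8g8b8 pixels at once using
 * _mm_movemask_epi8, which gathers the most significant bit of every
 * byte.  is_opaque and is_transparent look only at the alpha bytes
 * (byte offsets 3, 7, 11 and 15, hence the 0x8888 mask), while is_zero
 * requires all sixteen bytes to be zero.
 */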
static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}

static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

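/* Broadcast the alpha word of each unpacked pixel (word 3 of the low
 * half, word 7 of the high half) across that pixel's four channel lanes.
 */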
static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i  data_lo,
                    __m128i  data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i  data_lo,
                        __m128i  data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

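/* Multiply two sets of unpacked channels and renormalize.  The product
 * of two 8-bit values is divided by 255 with rounding via the identity
 * round (x / 255) = (x + 0x80 + ((x + 0x80) >> 8)) >> 8, which is exact
 * for x <= 255 * 255 and is computed here as ((x + 0x80) * 0x0101) >> 16
 * using _mm_mulhi_epu16.
 */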
static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}

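/* Component-wise src * alpha_dst + dst * alpha_src (each product
 * renormalized by 255) with unsigned saturation; this is the building
 * block for the ATOP and XOR combiners further down.
 */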
static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}

static force_inline void
negate_2x128 (__m128i  data_lo,
              __m128i  data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

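/* Swap the red and blue words of each unpacked pixel, leaving green and
 * alpha in place: _MM_SHUFFLE (3, 0, 1, 2) maps B,G,R,A to R,G,B,A.
 */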
static force_inline void
invert_colors_2x128 (__m128i  data_lo,
                     __m128i  data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

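/* Porter-Duff OVER on unpacked pixels:
 * dst = src + dst * (255 - alpha_src) / 255, with unsigned saturation.
 */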
static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}

static force_inline void
over_rev_non_pre_2x128 (__m128i  src_lo,
                        __m128i  src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}

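/* (src IN mask) OVER dst: src and its alpha are first multiplied by the
 * mask, then the usual OVER is applied, i.e. per channel
 * dst = src * mask + dst * (255 - alpha_src * mask) with each product
 * renormalized by 255.
 */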
static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}

/* load 4 pixels from a 16-byte-aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels with a write-combining (non-temporal) store to a
 * 16-byte-aligned address
 */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i  data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels to a 16-byte-aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i  data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels to an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i  data)
{
    _mm_storeu_si128 (dst, data);
}

static force_inline __m128i
load_32_1x128 (uint32_t data)
{
    return _mm_cvtsi32_si128 (data);
}

static force_inline __m128i
expand_alpha_rev_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
expand_pixel_8_1x128 (uint8_t data)
{
    return _mm_shufflelo_epi16 (
        unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
pix_multiply_1x128 (__m128i data,
                    __m128i alpha)
{
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
                                            mask_0080),
                            mask_0101);
}

static force_inline __m128i
pix_add_multiply_1x128 (__m128i* src,
                        __m128i* alpha_dst,
                        __m128i* dst,
                        __m128i* alpha_src)
{
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);

    return _mm_adds_epu8 (t1, t2);
}

static force_inline __m128i
negate_1x128 (__m128i data)
{
    return _mm_xor_si128 (data, mask_00ff);
}

static force_inline __m128i
invert_colors_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m128i
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}

static force_inline __m128i
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
                       pix_multiply_1x128 (*alpha, *mask),
                       *dst);
}

static force_inline __m128i
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
    __m128i alpha = expand_alpha_1x128 (src);

    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
                                           _mm_or_si128 (alpha, mask_alpha)),
                       alpha,
                       dst);
}

static force_inline uint32_t
pack_1x128_32 (__m128i data)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}

static force_inline __m128i
expand565_16_1x128 (uint16_t pixel)
{
    __m128i m = _mm_cvtsi32_si128 (pixel);

    m = unpack_565_to_8888 (m);

    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
}

static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m128i xmms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        xmms = unpack_32_1x128 (src);
        return pack_1x128_32 (
            over_1x128 (xmms, expand_alpha_1x128 (xmms),
                        unpack_32_1x128 (dst)));
    }

    return dst;
}

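/* combine1 and combine4 fetch one and four source pixels respectively,
 * applying the (unified) mask when pm is non-NULL: the mask's alpha is
 * expanded to all channels and multiplied into the source.
 */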
static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m128i ms, mm;

        mm = unpack_32_1x128 (*pm);
        mm = expand_alpha_1x128 (mm);

        ms = unpack_32_1x128 (s);
        ms = pix_multiply_1x128 (ms, mm);

        s = pack_1x128_32 (ms);
    }

    return s;
}

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}

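/* The combiners below share one loop structure: a scalar head loop runs
 * until dst reaches 16-byte alignment, a main loop handles 4 pixels per
 * iteration (taking shortcuts where possible, e.g. all-zero or all-opaque
 * groups in OVER), and a scalar tail consumes the remainder.
 */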
static force_inline void
core_combine_over_u_sse2_mask (uint32_t *      pd,
                               const uint32_t* ps,
                               const uint32_t* pm,
                               int             w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i mask = load_128_unaligned ((__m128i *)pm);

        if (!is_zero (mask))
        {
            __m128i src;
            __m128i src_hi, src_lo;
            __m128i mask_hi, mask_lo;
            __m128i alpha_hi, alpha_lo;

            src = load_128_unaligned ((__m128i *)ps);

            if (is_opaque (_mm_and_si128 (src, mask)))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);
                __m128i dst_hi, dst_lo;

                unpack_128_2x128 (mask, &mask_lo, &mask_hi);
                unpack_128_2x128 (src, &src_lo, &src_hi);

                expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
                pix_multiply_2x128 (&src_lo, &src_hi,
                                    &mask_lo, &mask_hi,
                                    &src_lo, &src_hi);

                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);

                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        pm += 4;
        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;

        w--;
    }
}

static force_inline void
core_combine_over_u_sse2_no_mask (uint32_t *      pd,
                                  const uint32_t* ps,
                                  int             w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i src;
        __m128i src_hi, src_lo, dst_hi, dst_lo;
        __m128i alpha_hi, alpha_lo;

        src = load_128_unaligned ((__m128i *)ps);

        if (!is_zero (src))
        {
            if (is_opaque (src))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);

                unpack_128_2x128 (src, &src_lo, &src_hi);
                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);
                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;

        w--;
    }
}

static force_inline void
sse2_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    if (pm)
        core_combine_over_u_sse2_mask (pd, ps, pm, w);
    else
        core_combine_over_u_sse2_no_mask (pd, ps, w);
}

static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w &&
           ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);

        /* rebuild the 4 pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        w -= 4;
        ps += 4;
        pd += 4;

        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (dst),
                                expand_alpha_1x128 (unpack_32_1x128 (src))));
    }

    return dst;
}

static void
sse2_combine_in_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               pd,
                   const uint32_t *         ps,
                   const uint32_t *         pm,
                   int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               pd,
                           const uint32_t *         ps,
                           const uint32_t *         pm,
                           int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               pd,
                            const uint32_t *         ps,
                            const uint32_t *         pm,
                            int                      w)
{
    while (w && ((uintptr_t)pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));

        if (pm)
            pm++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;

        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        ps++;
        if (pm)
            pm++;
        w--;
    }
}

static void
sse2_combine_out_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               pd,
                    const uint32_t *         ps,
                    const uint32_t *         pm,
                    int                      w)
{
    while (w && ((uintptr_t)pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

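/* ATOP: dst = (src * alpha_dst + dst * (255 - alpha_src)) / 255.
 * The reverse variant below swaps which operand's alpha is negated.
 */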
static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
    __m128i da = expand_alpha_1x128 (d);

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_u (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               pd,
                     const uint32_t *         ps,
                     const uint32_t *         pm,
                     int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = expand_alpha_1x128 (s);
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               pd,
                             const uint32_t *         ps,
                             const uint32_t *         pm,
                             int                      w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

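/* XOR: dst = (src * (255 - alpha_dst) + dst * (255 - alpha_src)) / 255,
 * i.e. each operand keeps only the part not covered by the other.
 */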
static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
}

static void
sse2_combine_xor_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
sse2_combine_add_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dst,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    while (w && (uintptr_t)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        w--;
    }

    while (w >= 4)
    {
        __m128i s;

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m128i ms = unpack_32_1x128 (src);
    __m128i md = unpack_32_1x128 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x128 (
            ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
}

static void
sse2_combine_saturate_u (pixman_implementation_t *imp,
                         pixman_op_t              op,
                         uint32_t *               pd,
                         const uint32_t *         ps,
                         const uint32_t *         pm,
                         int                      w)
{
    uint32_t s, d;

    uint32_t pack_cmp;
    __m128i xmm_src, xmm_dst;

    while (w && (uintptr_t)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst = load_128_aligned ((__m128i*)pd);
        xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);

        pack_cmp = _mm_movemask_epi8 (
            _mm_cmpgt_epi32 (
                _mm_srli_epi32 (xmm_src, 24),
                _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));

        /* if any src alpha is greater than the respective ~dst alpha */
        if (pack_cmp)
        {
            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;

            s = combine1 (ps++, pm);
            d = *pd;
            *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
            if (pm)
                pm++;
        }
        else
        {
            save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));

            pd += 4;
            ps += 4;
            if (pm)
                pm += 4;
        }

        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
        ps++;
        if (pm)
            pm++;
    }
}

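/* The _ca ("component alpha") combiners below take a full a8r8g8b8 mask
 * with a separate weight for every color channel, so the mask is
 * multiplied in channel by channel instead of through a single expanded
 * alpha value.
 */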
michael@0 1507 static void
michael@0 1508 sse2_combine_src_ca (pixman_implementation_t *imp,
michael@0 1509 pixman_op_t op,
michael@0 1510 uint32_t * pd,
michael@0 1511 const uint32_t * ps,
michael@0 1512 const uint32_t * pm,
michael@0 1513 int w)
michael@0 1514 {
michael@0 1515 uint32_t s, m;
michael@0 1516
michael@0 1517 __m128i xmm_src_lo, xmm_src_hi;
michael@0 1518 __m128i xmm_mask_lo, xmm_mask_hi;
michael@0 1519 __m128i xmm_dst_lo, xmm_dst_hi;
michael@0 1520
michael@0 1521 while (w && (uintptr_t)pd & 15)
michael@0 1522 {
michael@0 1523 s = *ps++;
michael@0 1524 m = *pm++;
michael@0 1525 *pd++ = pack_1x128_32 (
michael@0 1526 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
michael@0 1527 w--;
michael@0 1528 }
michael@0 1529
michael@0 1530 while (w >= 4)
michael@0 1531 {
michael@0 1532 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
michael@0 1533 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
michael@0 1534
michael@0 1535 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
michael@0 1536 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
michael@0 1537
michael@0 1538 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
michael@0 1539 &xmm_mask_lo, &xmm_mask_hi,
michael@0 1540 &xmm_dst_lo, &xmm_dst_hi);
michael@0 1541
michael@0 1542 save_128_aligned (
michael@0 1543 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 1544
michael@0 1545 ps += 4;
michael@0 1546 pd += 4;
michael@0 1547 pm += 4;
michael@0 1548 w -= 4;
michael@0 1549 }
michael@0 1550
michael@0 1551 while (w)
michael@0 1552 {
michael@0 1553 s = *ps++;
michael@0 1554 m = *pm++;
michael@0 1555 *pd++ = pack_1x128_32 (
michael@0 1556 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
michael@0 1557 w--;
michael@0 1558 }
michael@0 1559 }
michael@0 1560
michael@0 1561 static force_inline uint32_t
michael@0 1562 core_combine_over_ca_pixel_sse2 (uint32_t src,
michael@0 1563 uint32_t mask,
michael@0 1564 uint32_t dst)
michael@0 1565 {
michael@0 1566 __m128i s = unpack_32_1x128 (src);
michael@0 1567 __m128i expAlpha = expand_alpha_1x128 (s);
michael@0 1568 __m128i unpk_mask = unpack_32_1x128 (mask);
michael@0 1569 __m128i unpk_dst = unpack_32_1x128 (dst);
michael@0 1570
michael@0 1571 return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
michael@0 1572 }
michael@0 1573
michael@0 1574 static void
michael@0 1575 sse2_combine_over_ca (pixman_implementation_t *imp,
michael@0 1576 pixman_op_t op,
michael@0 1577 uint32_t * pd,
michael@0 1578 const uint32_t * ps,
michael@0 1579 const uint32_t * pm,
michael@0 1580 int w)
michael@0 1581 {
michael@0 1582 uint32_t s, m, d;
michael@0 1583
michael@0 1584 __m128i xmm_alpha_lo, xmm_alpha_hi;
michael@0 1585 __m128i xmm_src_lo, xmm_src_hi;
michael@0 1586 __m128i xmm_dst_lo, xmm_dst_hi;
michael@0 1587 __m128i xmm_mask_lo, xmm_mask_hi;
michael@0 1588
michael@0 1589 while (w && (uintptr_t)pd & 15)
michael@0 1590 {
michael@0 1591 s = *ps++;
michael@0 1592 m = *pm++;
michael@0 1593 d = *pd;
michael@0 1594
michael@0 1595 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
michael@0 1596 w--;
michael@0 1597 }
michael@0 1598
michael@0 1599 while (w >= 4)
michael@0 1600 {
michael@0 1601 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
michael@0 1602 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
michael@0 1603 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
michael@0 1604
michael@0 1605 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
michael@0 1606 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
michael@0 1607 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
michael@0 1608
michael@0 1609 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
michael@0 1610 &xmm_alpha_lo, &xmm_alpha_hi);
michael@0 1611
michael@0 1612 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
michael@0 1613 &xmm_alpha_lo, &xmm_alpha_hi,
michael@0 1614 &xmm_mask_lo, &xmm_mask_hi,
michael@0 1615 &xmm_dst_lo, &xmm_dst_hi);
michael@0 1616
michael@0 1617 save_128_aligned (
michael@0 1618 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 1619
michael@0 1620 ps += 4;
michael@0 1621 pd += 4;
michael@0 1622 pm += 4;
michael@0 1623 w -= 4;
michael@0 1624 }
michael@0 1625
michael@0 1626 while (w)
michael@0 1627 {
michael@0 1628 s = *ps++;
michael@0 1629 m = *pm++;
michael@0 1630 d = *pd;
michael@0 1631
michael@0 1632 *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
michael@0 1633 w--;
michael@0 1634 }
michael@0 1635 }
michael@0 1636
michael@0 1637 static force_inline uint32_t
michael@0 1638 core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
michael@0 1639 uint32_t mask,
michael@0 1640 uint32_t dst)
michael@0 1641 {
michael@0 1642 __m128i d = unpack_32_1x128 (dst);
michael@0 1643
michael@0 1644 return pack_1x128_32 (
michael@0 1645 over_1x128 (d, expand_alpha_1x128 (d),
michael@0 1646 pix_multiply_1x128 (unpack_32_1x128 (src),
michael@0 1647 unpack_32_1x128 (mask))));
michael@0 1648 }
michael@0 1649
michael@0 1650 static void
michael@0 1651 sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
michael@0 1652 pixman_op_t op,
michael@0 1653 uint32_t * pd,
michael@0 1654 const uint32_t * ps,
michael@0 1655 const uint32_t * pm,
michael@0 1656 int w)
michael@0 1657 {
michael@0 1658 uint32_t s, m, d;
michael@0 1659
michael@0 1660 __m128i xmm_alpha_lo, xmm_alpha_hi;
michael@0 1661 __m128i xmm_src_lo, xmm_src_hi;
michael@0 1662 __m128i xmm_dst_lo, xmm_dst_hi;
michael@0 1663 __m128i xmm_mask_lo, xmm_mask_hi;
michael@0 1664
michael@0 1665 while (w && (uintptr_t)pd & 15)
michael@0 1666 {
michael@0 1667 s = *ps++;
michael@0 1668 m = *pm++;
michael@0 1669 d = *pd;
michael@0 1670
michael@0 1671 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
michael@0 1672 w--;
michael@0 1673 }
michael@0 1674
michael@0 1675 while (w >= 4)
michael@0 1676 {
michael@0 1677 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
michael@0 1678 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
michael@0 1679 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
michael@0 1680
michael@0 1681 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
michael@0 1682 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
michael@0 1683 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
michael@0 1684
michael@0 1685 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
michael@0 1686 &xmm_alpha_lo, &xmm_alpha_hi);
michael@0 1687 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
michael@0 1688 &xmm_mask_lo, &xmm_mask_hi,
michael@0 1689 &xmm_mask_lo, &xmm_mask_hi);
michael@0 1690
michael@0 1691 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
michael@0 1692 &xmm_alpha_lo, &xmm_alpha_hi,
michael@0 1693 &xmm_mask_lo, &xmm_mask_hi);
michael@0 1694
michael@0 1695 save_128_aligned (
michael@0 1696 (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
michael@0 1697
michael@0 1698 ps += 4;
michael@0 1699 pd += 4;
michael@0 1700 pm += 4;
michael@0 1701 w -= 4;
michael@0 1702 }
michael@0 1703
michael@0 1704 while (w)
michael@0 1705 {
michael@0 1706 s = *ps++;
michael@0 1707 m = *pm++;
michael@0 1708 d = *pd;
michael@0 1709
michael@0 1710 *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
michael@0 1711 w--;
michael@0 1712 }
michael@0 1713 }
michael@0 1714
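/* Component-alpha IN: dest = (src IN mask) IN dest, per channel
 * d' = s*m*da.
 */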
michael@0 1715 static void
michael@0 1716 sse2_combine_in_ca (pixman_implementation_t *imp,
michael@0 1717 pixman_op_t op,
michael@0 1718 uint32_t * pd,
michael@0 1719 const uint32_t * ps,
michael@0 1720 const uint32_t * pm,
michael@0 1721 int w)
michael@0 1722 {
michael@0 1723 uint32_t s, m, d;
michael@0 1724
michael@0 1725 __m128i xmm_alpha_lo, xmm_alpha_hi;
michael@0 1726 __m128i xmm_src_lo, xmm_src_hi;
michael@0 1727 __m128i xmm_dst_lo, xmm_dst_hi;
michael@0 1728 __m128i xmm_mask_lo, xmm_mask_hi;
michael@0 1729
michael@0 1730 while (w && (uintptr_t)pd & 15)
michael@0 1731 {
michael@0 1732 s = *ps++;
michael@0 1733 m = *pm++;
michael@0 1734 d = *pd;
michael@0 1735
michael@0 1736 *pd++ = pack_1x128_32 (
michael@0 1737 pix_multiply_1x128 (
michael@0 1738 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
michael@0 1739 expand_alpha_1x128 (unpack_32_1x128 (d))));
michael@0 1740
michael@0 1741 w--;
michael@0 1742 }
michael@0 1743
michael@0 1744 while (w >= 4)
michael@0 1745 {
michael@0 1746 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
michael@0 1747 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
michael@0 1748 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
michael@0 1749
michael@0 1750 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
michael@0 1751 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
michael@0 1752 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
michael@0 1753
michael@0 1754 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
michael@0 1755 &xmm_alpha_lo, &xmm_alpha_hi);
michael@0 1756
michael@0 1757 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
michael@0 1758 &xmm_mask_lo, &xmm_mask_hi,
michael@0 1759 &xmm_dst_lo, &xmm_dst_hi);
michael@0 1760
michael@0 1761 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
michael@0 1762 &xmm_alpha_lo, &xmm_alpha_hi,
michael@0 1763 &xmm_dst_lo, &xmm_dst_hi);
michael@0 1764
michael@0 1765 save_128_aligned (
michael@0 1766 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 1767
michael@0 1768 ps += 4;
michael@0 1769 pd += 4;
michael@0 1770 pm += 4;
michael@0 1771 w -= 4;
michael@0 1772 }
michael@0 1773
michael@0 1774 while (w)
michael@0 1775 {
michael@0 1776 s = *ps++;
michael@0 1777 m = *pm++;
michael@0 1778 d = *pd;
michael@0 1779
michael@0 1780 *pd++ = pack_1x128_32 (
michael@0 1781 pix_multiply_1x128 (
michael@0 1782 pix_multiply_1x128 (
michael@0 1783 unpack_32_1x128 (s), unpack_32_1x128 (m)),
michael@0 1784 expand_alpha_1x128 (unpack_32_1x128 (d))));
michael@0 1785
michael@0 1786 w--;
michael@0 1787 }
michael@0 1788 }
michael@0 1789
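/* Component-alpha IN_REVERSE: per channel d' = d*(m*sa); the
 * destination is kept only where the masked source alpha covers it.
 */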
michael@0 1790 static void
michael@0 1791 sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
michael@0 1792 pixman_op_t op,
michael@0 1793 uint32_t * pd,
michael@0 1794 const uint32_t * ps,
michael@0 1795 const uint32_t * pm,
michael@0 1796 int w)
michael@0 1797 {
michael@0 1798 uint32_t s, m, d;
michael@0 1799
michael@0 1800 __m128i xmm_alpha_lo, xmm_alpha_hi;
michael@0 1801 __m128i xmm_src_lo, xmm_src_hi;
michael@0 1802 __m128i xmm_dst_lo, xmm_dst_hi;
michael@0 1803 __m128i xmm_mask_lo, xmm_mask_hi;
michael@0 1804
michael@0 1805 while (w && (uintptr_t)pd & 15)
michael@0 1806 {
michael@0 1807 s = *ps++;
michael@0 1808 m = *pm++;
michael@0 1809 d = *pd;
michael@0 1810
michael@0 1811 *pd++ = pack_1x128_32 (
michael@0 1812 pix_multiply_1x128 (
michael@0 1813 unpack_32_1x128 (d),
michael@0 1814 pix_multiply_1x128 (unpack_32_1x128 (m),
michael@0 1815 expand_alpha_1x128 (unpack_32_1x128 (s)))));
michael@0 1816 w--;
michael@0 1817 }
michael@0 1818
michael@0 1819 while (w >= 4)
michael@0 1820 {
michael@0 1821 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
michael@0 1822 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
michael@0 1823 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
michael@0 1824
michael@0 1825 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
michael@0 1826 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
michael@0 1827 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
michael@0 1828
michael@0 1829 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
michael@0 1830 &xmm_alpha_lo, &xmm_alpha_hi);
michael@0 1831 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
michael@0 1832 &xmm_alpha_lo, &xmm_alpha_hi,
michael@0 1833 &xmm_alpha_lo, &xmm_alpha_hi);
michael@0 1834
michael@0 1835 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
michael@0 1836 &xmm_alpha_lo, &xmm_alpha_hi,
michael@0 1837 &xmm_dst_lo, &xmm_dst_hi);
michael@0 1838
michael@0 1839 save_128_aligned (
michael@0 1840 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 1841
michael@0 1842 ps += 4;
michael@0 1843 pd += 4;
michael@0 1844 pm += 4;
michael@0 1845 w -= 4;
michael@0 1846 }
michael@0 1847
michael@0 1848 while (w)
michael@0 1849 {
michael@0 1850 s = *ps++;
michael@0 1851 m = *pm++;
michael@0 1852 d = *pd;
michael@0 1853
michael@0 1854 *pd++ = pack_1x128_32 (
michael@0 1855 pix_multiply_1x128 (
michael@0 1856 unpack_32_1x128 (d),
michael@0 1857 pix_multiply_1x128 (unpack_32_1x128 (m),
michael@0 1858 expand_alpha_1x128 (unpack_32_1x128 (s)))));
michael@0 1859 w--;
michael@0 1860 }
michael@0 1861 }
michael@0 1862
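/* Component-alpha OUT: per channel d' = (s*m)*(1 - da); the masked
 * source is kept only outside the destination's coverage.
 */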
michael@0 1863 static void
michael@0 1864 sse2_combine_out_ca (pixman_implementation_t *imp,
michael@0 1865 pixman_op_t op,
michael@0 1866 uint32_t * pd,
michael@0 1867 const uint32_t * ps,
michael@0 1868 const uint32_t * pm,
michael@0 1869 int w)
michael@0 1870 {
michael@0 1871 uint32_t s, m, d;
michael@0 1872
michael@0 1873 __m128i xmm_alpha_lo, xmm_alpha_hi;
michael@0 1874 __m128i xmm_src_lo, xmm_src_hi;
michael@0 1875 __m128i xmm_dst_lo, xmm_dst_hi;
michael@0 1876 __m128i xmm_mask_lo, xmm_mask_hi;
michael@0 1877
michael@0 1878 while (w && (uintptr_t)pd & 15)
michael@0 1879 {
michael@0 1880 s = *ps++;
michael@0 1881 m = *pm++;
michael@0 1882 d = *pd;
michael@0 1883
michael@0 1884 *pd++ = pack_1x128_32 (
michael@0 1885 pix_multiply_1x128 (
michael@0 1886 pix_multiply_1x128 (
michael@0 1887 unpack_32_1x128 (s), unpack_32_1x128 (m)),
michael@0 1888 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
michael@0 1889 w--;
michael@0 1890 }
michael@0 1891
michael@0 1892 while (w >= 4)
michael@0 1893 {
michael@0 1894 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
michael@0 1895 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
michael@0 1896 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
michael@0 1897
michael@0 1898 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
michael@0 1899 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
michael@0 1900 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
michael@0 1901
michael@0 1902 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
michael@0 1903 &xmm_alpha_lo, &xmm_alpha_hi);
michael@0 1904 negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
michael@0 1905 &xmm_alpha_lo, &xmm_alpha_hi);
michael@0 1906
michael@0 1907 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
michael@0 1908 &xmm_mask_lo, &xmm_mask_hi,
michael@0 1909 &xmm_dst_lo, &xmm_dst_hi);
michael@0 1910 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
michael@0 1911 &xmm_alpha_lo, &xmm_alpha_hi,
michael@0 1912 &xmm_dst_lo, &xmm_dst_hi);
michael@0 1913
michael@0 1914 save_128_aligned (
michael@0 1915 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 1916
michael@0 1917 ps += 4;
michael@0 1918 pd += 4;
michael@0 1919 pm += 4;
michael@0 1920 w -= 4;
michael@0 1921 }
michael@0 1922
michael@0 1923 while (w)
michael@0 1924 {
michael@0 1925 s = *ps++;
michael@0 1926 m = *pm++;
michael@0 1927 d = *pd;
michael@0 1928
michael@0 1929 *pd++ = pack_1x128_32 (
michael@0 1930 pix_multiply_1x128 (
michael@0 1931 pix_multiply_1x128 (
michael@0 1932 unpack_32_1x128 (s), unpack_32_1x128 (m)),
michael@0 1933 negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
michael@0 1934
michael@0 1935 w--;
michael@0 1936 }
michael@0 1937 }
michael@0 1938
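/* Component-alpha OUT_REVERSE: per channel d' = d*(1 - m*sa). */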
michael@0 1939 static void
michael@0 1940 sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
michael@0 1941 pixman_op_t op,
michael@0 1942 uint32_t * pd,
michael@0 1943 const uint32_t * ps,
michael@0 1944 const uint32_t * pm,
michael@0 1945 int w)
michael@0 1946 {
michael@0 1947 uint32_t s, m, d;
michael@0 1948
michael@0 1949 __m128i xmm_alpha_lo, xmm_alpha_hi;
michael@0 1950 __m128i xmm_src_lo, xmm_src_hi;
michael@0 1951 __m128i xmm_dst_lo, xmm_dst_hi;
michael@0 1952 __m128i xmm_mask_lo, xmm_mask_hi;
michael@0 1953
michael@0 1954 while (w && (uintptr_t)pd & 15)
michael@0 1955 {
michael@0 1956 s = *ps++;
michael@0 1957 m = *pm++;
michael@0 1958 d = *pd;
michael@0 1959
michael@0 1960 *pd++ = pack_1x128_32 (
michael@0 1961 pix_multiply_1x128 (
michael@0 1962 unpack_32_1x128 (d),
michael@0 1963 negate_1x128 (pix_multiply_1x128 (
michael@0 1964 unpack_32_1x128 (m),
michael@0 1965 expand_alpha_1x128 (unpack_32_1x128 (s))))));
michael@0 1966 w--;
michael@0 1967 }
michael@0 1968
michael@0 1969 while (w >= 4)
michael@0 1970 {
michael@0 1971 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
michael@0 1972 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
michael@0 1973 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
michael@0 1974
michael@0 1975 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
michael@0 1976 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
michael@0 1977 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
michael@0 1978
michael@0 1979 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
michael@0 1980 &xmm_alpha_lo, &xmm_alpha_hi);
michael@0 1981
michael@0 1982 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
michael@0 1983 &xmm_alpha_lo, &xmm_alpha_hi,
michael@0 1984 &xmm_mask_lo, &xmm_mask_hi);
michael@0 1985
michael@0 1986 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
michael@0 1987 &xmm_mask_lo, &xmm_mask_hi);
michael@0 1988
michael@0 1989 pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
michael@0 1990 &xmm_mask_lo, &xmm_mask_hi,
michael@0 1991 &xmm_dst_lo, &xmm_dst_hi);
michael@0 1992
michael@0 1993 save_128_aligned (
michael@0 1994 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 1995
michael@0 1996 ps += 4;
michael@0 1997 pd += 4;
michael@0 1998 pm += 4;
michael@0 1999 w -= 4;
michael@0 2000 }
michael@0 2001
michael@0 2002 while (w)
michael@0 2003 {
michael@0 2004 s = *ps++;
michael@0 2005 m = *pm++;
michael@0 2006 d = *pd;
michael@0 2007
michael@0 2008 *pd++ = pack_1x128_32 (
michael@0 2009 pix_multiply_1x128 (
michael@0 2010 unpack_32_1x128 (d),
michael@0 2011 negate_1x128 (pix_multiply_1x128 (
michael@0 2012 unpack_32_1x128 (m),
michael@0 2013 expand_alpha_1x128 (unpack_32_1x128 (s))))));
michael@0 2014 w--;
michael@0 2015 }
michael@0 2016 }
michael@0 2017
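/* Component-alpha ATOP: per channel
 *
 *     d' = (s*m)*da + d*(1 - sa*m)
 *
 * pix_add_multiply_1x128 (&x, &a, &y, &b) effectively computes the
 * two products x*a and y*b and adds them with byte saturation, so the
 * pixel helper below only has to prepare the four factors.
 */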
michael@0 2018 static force_inline uint32_t
michael@0 2019 core_combine_atop_ca_pixel_sse2 (uint32_t src,
michael@0 2020 uint32_t mask,
michael@0 2021 uint32_t dst)
michael@0 2022 {
michael@0 2023 __m128i m = unpack_32_1x128 (mask);
michael@0 2024 __m128i s = unpack_32_1x128 (src);
michael@0 2025 __m128i d = unpack_32_1x128 (dst);
michael@0 2026 __m128i sa = expand_alpha_1x128 (s);
michael@0 2027 __m128i da = expand_alpha_1x128 (d);
michael@0 2028
michael@0 2029 s = pix_multiply_1x128 (s, m);
michael@0 2030 m = negate_1x128 (pix_multiply_1x128 (m, sa));
michael@0 2031
michael@0 2032 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
michael@0 2033 }
michael@0 2034
michael@0 2035 static void
michael@0 2036 sse2_combine_atop_ca (pixman_implementation_t *imp,
michael@0 2037 pixman_op_t op,
michael@0 2038 uint32_t * pd,
michael@0 2039 const uint32_t * ps,
michael@0 2040 const uint32_t * pm,
michael@0 2041 int w)
michael@0 2042 {
michael@0 2043 uint32_t s, m, d;
michael@0 2044
michael@0 2045 __m128i xmm_src_lo, xmm_src_hi;
michael@0 2046 __m128i xmm_dst_lo, xmm_dst_hi;
michael@0 2047 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
michael@0 2048 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
michael@0 2049 __m128i xmm_mask_lo, xmm_mask_hi;
michael@0 2050
michael@0 2051 while (w && (uintptr_t)pd & 15)
michael@0 2052 {
michael@0 2053 s = *ps++;
michael@0 2054 m = *pm++;
michael@0 2055 d = *pd;
michael@0 2056
michael@0 2057 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
michael@0 2058 w--;
michael@0 2059 }
michael@0 2060
michael@0 2061 while (w >= 4)
michael@0 2062 {
michael@0 2063 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
michael@0 2064 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
michael@0 2065 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
michael@0 2066
michael@0 2067 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
michael@0 2068 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
michael@0 2069 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
michael@0 2070
michael@0 2071 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
michael@0 2072 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
michael@0 2073 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
michael@0 2074 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
michael@0 2075
michael@0 2076 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
michael@0 2077 &xmm_mask_lo, &xmm_mask_hi,
michael@0 2078 &xmm_src_lo, &xmm_src_hi);
michael@0 2079 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
michael@0 2080 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
michael@0 2081 &xmm_mask_lo, &xmm_mask_hi);
michael@0 2082
michael@0 2083 negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
michael@0 2084
michael@0 2085 pix_add_multiply_2x128 (
michael@0 2086 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
michael@0 2087 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
michael@0 2088 &xmm_dst_lo, &xmm_dst_hi);
michael@0 2089
michael@0 2090 save_128_aligned (
michael@0 2091 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 2092
michael@0 2093 ps += 4;
michael@0 2094 pd += 4;
michael@0 2095 pm += 4;
michael@0 2096 w -= 4;
michael@0 2097 }
michael@0 2098
michael@0 2099 while (w)
michael@0 2100 {
michael@0 2101 s = *ps++;
michael@0 2102 m = *pm++;
michael@0 2103 d = *pd;
michael@0 2104
michael@0 2105 *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
michael@0 2106 w--;
michael@0 2107 }
michael@0 2108 }
michael@0 2109
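/* Component-alpha ATOP_REVERSE: per channel
 *
 *     d' = d*(m*sa) + (s*m)*(1 - da)
 *
 * Note that the local variable da below actually holds the negated
 * destination alpha (1 - da) by the time it reaches
 * pix_add_multiply_1x128.
 */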
michael@0 2110 static force_inline uint32_t
michael@0 2111 core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
michael@0 2112 uint32_t mask,
michael@0 2113 uint32_t dst)
michael@0 2114 {
michael@0 2115 __m128i m = unpack_32_1x128 (mask);
michael@0 2116 __m128i s = unpack_32_1x128 (src);
michael@0 2117 __m128i d = unpack_32_1x128 (dst);
michael@0 2118
michael@0 2119 __m128i da = negate_1x128 (expand_alpha_1x128 (d));
michael@0 2120 __m128i sa = expand_alpha_1x128 (s);
michael@0 2121
michael@0 2122 s = pix_multiply_1x128 (s, m);
michael@0 2123 m = pix_multiply_1x128 (m, sa);
michael@0 2124
michael@0 2125 return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
michael@0 2126 }
michael@0 2127
michael@0 2128 static void
michael@0 2129 sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
michael@0 2130 pixman_op_t op,
michael@0 2131 uint32_t * pd,
michael@0 2132 const uint32_t * ps,
michael@0 2133 const uint32_t * pm,
michael@0 2134 int w)
michael@0 2135 {
michael@0 2136 uint32_t s, m, d;
michael@0 2137
michael@0 2138 __m128i xmm_src_lo, xmm_src_hi;
michael@0 2139 __m128i xmm_dst_lo, xmm_dst_hi;
michael@0 2140 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
michael@0 2141 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
michael@0 2142 __m128i xmm_mask_lo, xmm_mask_hi;
michael@0 2143
michael@0 2144 while (w && (uintptr_t)pd & 15)
michael@0 2145 {
michael@0 2146 s = *ps++;
michael@0 2147 m = *pm++;
michael@0 2148 d = *pd;
michael@0 2149
michael@0 2150 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
michael@0 2151 w--;
michael@0 2152 }
michael@0 2153
michael@0 2154 while (w >= 4)
michael@0 2155 {
michael@0 2156 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
michael@0 2157 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
michael@0 2158 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
michael@0 2159
michael@0 2160 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
michael@0 2161 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
michael@0 2162 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
michael@0 2163
michael@0 2164 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
michael@0 2165 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
michael@0 2166 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
michael@0 2167 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
michael@0 2168
michael@0 2169 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
michael@0 2170 &xmm_mask_lo, &xmm_mask_hi,
michael@0 2171 &xmm_src_lo, &xmm_src_hi);
michael@0 2172 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
michael@0 2173 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
michael@0 2174 &xmm_mask_lo, &xmm_mask_hi);
michael@0 2175
michael@0 2176 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
michael@0 2177 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
michael@0 2178
michael@0 2179 pix_add_multiply_2x128 (
michael@0 2180 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
michael@0 2181 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
michael@0 2182 &xmm_dst_lo, &xmm_dst_hi);
michael@0 2183
michael@0 2184 save_128_aligned (
michael@0 2185 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 2186
michael@0 2187 ps += 4;
michael@0 2188 pd += 4;
michael@0 2189 pm += 4;
michael@0 2190 w -= 4;
michael@0 2191 }
michael@0 2192
michael@0 2193 while (w)
michael@0 2194 {
michael@0 2195 s = *ps++;
michael@0 2196 m = *pm++;
michael@0 2197 d = *pd;
michael@0 2198
michael@0 2199 *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
michael@0 2200 w--;
michael@0 2201 }
michael@0 2202 }
michael@0 2203
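/* Component-alpha XOR: per channel
 *
 *     d' = (s*m)*(1 - da) + d*(1 - sa*m)
 *
 * The local names in the pixel helper below are historical and do not
 * match their contents: alpha_dst holds 1 - sa*m, dest holds s*m, and
 * alpha_src holds 1 - da.
 */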
michael@0 2204 static force_inline uint32_t
michael@0 2205 core_combine_xor_ca_pixel_sse2 (uint32_t src,
michael@0 2206 uint32_t mask,
michael@0 2207 uint32_t dst)
michael@0 2208 {
michael@0 2209 __m128i a = unpack_32_1x128 (mask);
michael@0 2210 __m128i s = unpack_32_1x128 (src);
michael@0 2211 __m128i d = unpack_32_1x128 (dst);
michael@0 2212
michael@0 2213 __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
michael@0 2214 a, expand_alpha_1x128 (s)));
michael@0 2215 __m128i dest = pix_multiply_1x128 (s, a);
michael@0 2216 __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
michael@0 2217
michael@0 2218 return pack_1x128_32 (pix_add_multiply_1x128 (&d,
michael@0 2219 &alpha_dst,
michael@0 2220 &dest,
michael@0 2221 &alpha_src));
michael@0 2222 }
michael@0 2223
michael@0 2224 static void
michael@0 2225 sse2_combine_xor_ca (pixman_implementation_t *imp,
michael@0 2226 pixman_op_t op,
michael@0 2227 uint32_t * pd,
michael@0 2228 const uint32_t * ps,
michael@0 2229 const uint32_t * pm,
michael@0 2230 int w)
michael@0 2231 {
michael@0 2232 uint32_t s, m, d;
michael@0 2233
michael@0 2234 __m128i xmm_src_lo, xmm_src_hi;
michael@0 2235 __m128i xmm_dst_lo, xmm_dst_hi;
michael@0 2236 __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
michael@0 2237 __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
michael@0 2238 __m128i xmm_mask_lo, xmm_mask_hi;
michael@0 2239
michael@0 2240 while (w && (uintptr_t)pd & 15)
michael@0 2241 {
michael@0 2242 s = *ps++;
michael@0 2243 m = *pm++;
michael@0 2244 d = *pd;
michael@0 2245
michael@0 2246 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
michael@0 2247 w--;
michael@0 2248 }
michael@0 2249
michael@0 2250 while (w >= 4)
michael@0 2251 {
michael@0 2252 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
michael@0 2253 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
michael@0 2254 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
michael@0 2255
michael@0 2256 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
michael@0 2257 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
michael@0 2258 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
michael@0 2259
michael@0 2260 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
michael@0 2261 &xmm_alpha_src_lo, &xmm_alpha_src_hi);
michael@0 2262 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
michael@0 2263 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
michael@0 2264
michael@0 2265 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
michael@0 2266 &xmm_mask_lo, &xmm_mask_hi,
michael@0 2267 &xmm_src_lo, &xmm_src_hi);
michael@0 2268 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
michael@0 2269 &xmm_alpha_src_lo, &xmm_alpha_src_hi,
michael@0 2270 &xmm_mask_lo, &xmm_mask_hi);
michael@0 2271
michael@0 2272 negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
michael@0 2273 &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
michael@0 2274 negate_2x128 (xmm_mask_lo, xmm_mask_hi,
michael@0 2275 &xmm_mask_lo, &xmm_mask_hi);
michael@0 2276
michael@0 2277 pix_add_multiply_2x128 (
michael@0 2278 &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
michael@0 2279 &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
michael@0 2280 &xmm_dst_lo, &xmm_dst_hi);
michael@0 2281
michael@0 2282 save_128_aligned (
michael@0 2283 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 2284
michael@0 2285 ps += 4;
michael@0 2286 pd += 4;
michael@0 2287 pm += 4;
michael@0 2288 w -= 4;
michael@0 2289 }
michael@0 2290
michael@0 2291 while (w)
michael@0 2292 {
michael@0 2293 s = *ps++;
michael@0 2294 m = *pm++;
michael@0 2295 d = *pd;
michael@0 2296
michael@0 2297 *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
michael@0 2298 w--;
michael@0 2299 }
michael@0 2300 }
michael@0 2301
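/* Component-alpha ADD: per channel d' = clamp (s*m + d, 255), using
 * the saturating byte add _mm_adds_epu8.
 */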
michael@0 2302 static void
michael@0 2303 sse2_combine_add_ca (pixman_implementation_t *imp,
michael@0 2304 pixman_op_t op,
michael@0 2305 uint32_t * pd,
michael@0 2306 const uint32_t * ps,
michael@0 2307 const uint32_t * pm,
michael@0 2308 int w)
michael@0 2309 {
michael@0 2310 uint32_t s, m, d;
michael@0 2311
michael@0 2312 __m128i xmm_src_lo, xmm_src_hi;
michael@0 2313 __m128i xmm_dst_lo, xmm_dst_hi;
michael@0 2314 __m128i xmm_mask_lo, xmm_mask_hi;
michael@0 2315
michael@0 2316 while (w && (uintptr_t)pd & 15)
michael@0 2317 {
michael@0 2318 s = *ps++;
michael@0 2319 m = *pm++;
michael@0 2320 d = *pd;
michael@0 2321
michael@0 2322 *pd++ = pack_1x128_32 (
michael@0 2323 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
michael@0 2324 unpack_32_1x128 (m)),
michael@0 2325 unpack_32_1x128 (d)));
michael@0 2326 w--;
michael@0 2327 }
michael@0 2328
michael@0 2329 while (w >= 4)
michael@0 2330 {
michael@0 2331 xmm_src_hi = load_128_unaligned ((__m128i*)ps);
michael@0 2332 xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
michael@0 2333 xmm_dst_hi = load_128_aligned ((__m128i*)pd);
michael@0 2334
michael@0 2335 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
michael@0 2336 unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
michael@0 2337 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
michael@0 2338
michael@0 2339 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
michael@0 2340 &xmm_mask_lo, &xmm_mask_hi,
michael@0 2341 &xmm_src_lo, &xmm_src_hi);
michael@0 2342
michael@0 2343 save_128_aligned (
michael@0 2344 (__m128i*)pd, pack_2x128_128 (
michael@0 2345 _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
michael@0 2346 _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
michael@0 2347
michael@0 2348 ps += 4;
michael@0 2349 pd += 4;
michael@0 2350 pm += 4;
michael@0 2351 w -= 4;
michael@0 2352 }
michael@0 2353
michael@0 2354 while (w)
michael@0 2355 {
michael@0 2356 s = *ps++;
michael@0 2357 m = *pm++;
michael@0 2358 d = *pd;
michael@0 2359
michael@0 2360 *pd++ = pack_1x128_32 (
michael@0 2361 _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
michael@0 2362 unpack_32_1x128 (m)),
michael@0 2363 unpack_32_1x128 (d)));
michael@0 2364 w--;
michael@0 2365 }
michael@0 2366 }
michael@0 2367
michael@0 2368 static force_inline __m128i
michael@0 2369 create_mask_16_128 (uint16_t mask)
michael@0 2370 {
michael@0 2371 return _mm_set1_epi16 (mask);
michael@0 2372 }
michael@0 2373
michael@0 2374 /* Work around a code generation bug in Sun Studio 12. */
michael@0 2375 #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
michael@0 2376 # define create_mask_2x32_128(mask0, mask1) \
michael@0 2377 (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
michael@0 2378 #else
michael@0 2379 static force_inline __m128i
michael@0 2380 create_mask_2x32_128 (uint32_t mask0,
michael@0 2381 uint32_t mask1)
michael@0 2382 {
michael@0 2383 return _mm_set_epi32 (mask0, mask1, mask0, mask1);
michael@0 2384 }
michael@0 2385 #endif
michael@0 2386
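/* The composite fast paths from here on all share one loop shape: a
 * scalar head that runs until the destination pointer is 16-byte
 * aligned, a SIMD body that processes four (or, for 16bpp
 * destinations, eight) pixels per iteration with aligned destination
 * stores, and a scalar tail for the remainder. Source and mask rows
 * carry no alignment guarantee, so they are always fetched with
 * unaligned loads.
 */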
michael@0 2387 static void
michael@0 2388 sse2_composite_over_n_8888 (pixman_implementation_t *imp,
michael@0 2389 pixman_composite_info_t *info)
michael@0 2390 {
michael@0 2391 PIXMAN_COMPOSITE_ARGS (info);
michael@0 2392 uint32_t src;
michael@0 2393 uint32_t *dst_line, *dst, d;
michael@0 2394 int32_t w;
michael@0 2395 int dst_stride;
michael@0 2396 __m128i xmm_src, xmm_alpha;
michael@0 2397 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
michael@0 2398
michael@0 2399 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
michael@0 2400
michael@0 2401 if (src == 0)
michael@0 2402 return;
michael@0 2403
michael@0 2404 PIXMAN_IMAGE_GET_LINE (
michael@0 2405 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
michael@0 2406
michael@0 2407 xmm_src = expand_pixel_32_1x128 (src);
michael@0 2408 xmm_alpha = expand_alpha_1x128 (xmm_src);
michael@0 2409
michael@0 2410 while (height--)
michael@0 2411 {
michael@0 2412 dst = dst_line;
michael@0 2413
michael@0 2414 dst_line += dst_stride;
michael@0 2415 w = width;
michael@0 2416
michael@0 2417 while (w && (uintptr_t)dst & 15)
michael@0 2418 {
michael@0 2419 d = *dst;
michael@0 2420 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
michael@0 2421 xmm_alpha,
michael@0 2422 unpack_32_1x128 (d)));
michael@0 2423 w--;
michael@0 2424 }
michael@0 2425
michael@0 2426 while (w >= 4)
michael@0 2427 {
michael@0 2428 xmm_dst = load_128_aligned ((__m128i*)dst);
michael@0 2429
michael@0 2430 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
michael@0 2431
michael@0 2432 over_2x128 (&xmm_src, &xmm_src,
michael@0 2433 &xmm_alpha, &xmm_alpha,
michael@0 2434 &xmm_dst_lo, &xmm_dst_hi);
michael@0 2435
michael@0 2436 /* rebuild the 4 pixel data and save */
michael@0 2437 save_128_aligned (
michael@0 2438 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 2439
michael@0 2440 w -= 4;
michael@0 2441 dst += 4;
michael@0 2442 }
michael@0 2443
michael@0 2444 while (w)
michael@0 2445 {
michael@0 2446 d = *dst;
michael@0 2447 *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
michael@0 2448 xmm_alpha,
michael@0 2449 unpack_32_1x128 (d)));
michael@0 2450 w--;
michael@0 2451 }
michael@0 2452
michael@0 2453 }
michael@0 2454 }
michael@0 2455
michael@0 2456 static void
michael@0 2457 sse2_composite_over_n_0565 (pixman_implementation_t *imp,
michael@0 2458 pixman_composite_info_t *info)
michael@0 2459 {
michael@0 2460 PIXMAN_COMPOSITE_ARGS (info);
michael@0 2461 uint32_t src;
michael@0 2462 uint16_t *dst_line, *dst, d;
michael@0 2463 int32_t w;
michael@0 2464 int dst_stride;
michael@0 2465 __m128i xmm_src, xmm_alpha;
michael@0 2466 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
michael@0 2467
michael@0 2468 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
michael@0 2469
michael@0 2470 if (src == 0)
michael@0 2471 return;
michael@0 2472
michael@0 2473 PIXMAN_IMAGE_GET_LINE (
michael@0 2474 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
michael@0 2475
michael@0 2476 xmm_src = expand_pixel_32_1x128 (src);
michael@0 2477 xmm_alpha = expand_alpha_1x128 (xmm_src);
michael@0 2478
michael@0 2479 while (height--)
michael@0 2480 {
michael@0 2481 dst = dst_line;
michael@0 2482
michael@0 2483 dst_line += dst_stride;
michael@0 2484 w = width;
michael@0 2485
michael@0 2486 while (w && (uintptr_t)dst & 15)
michael@0 2487 {
michael@0 2488 d = *dst;
michael@0 2489
michael@0 2490 *dst++ = pack_565_32_16 (
michael@0 2491 pack_1x128_32 (over_1x128 (xmm_src,
michael@0 2492 xmm_alpha,
michael@0 2493 expand565_16_1x128 (d))));
michael@0 2494 w--;
michael@0 2495 }
michael@0 2496
michael@0 2497 while (w >= 8)
michael@0 2498 {
michael@0 2499 xmm_dst = load_128_aligned ((__m128i*)dst);
michael@0 2500
michael@0 2501 unpack_565_128_4x128 (xmm_dst,
michael@0 2502 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
michael@0 2503
michael@0 2504 over_2x128 (&xmm_src, &xmm_src,
michael@0 2505 &xmm_alpha, &xmm_alpha,
michael@0 2506 &xmm_dst0, &xmm_dst1);
michael@0 2507 over_2x128 (&xmm_src, &xmm_src,
michael@0 2508 &xmm_alpha, &xmm_alpha,
michael@0 2509 &xmm_dst2, &xmm_dst3);
michael@0 2510
michael@0 2511 xmm_dst = pack_565_4x128_128 (
michael@0 2512 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
michael@0 2513
michael@0 2514 save_128_aligned ((__m128i*)dst, xmm_dst);
michael@0 2515
michael@0 2516 dst += 8;
michael@0 2517 w -= 8;
michael@0 2518 }
michael@0 2519
michael@0 2520 while (w--)
michael@0 2521 {
michael@0 2522 d = *dst;
michael@0 2523 *dst++ = pack_565_32_16 (
michael@0 2524 pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
michael@0 2525 expand565_16_1x128 (d))));
michael@0 2526 }
michael@0 2527 }
michael@0 2528
michael@0 2529 }
michael@0 2530
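/* ADD with a solid source and a component-alpha mask. The SIMD body
 * first tests the four mask pixels with _mm_cmpeq_epi32 and
 * _mm_movemask_epi8; a fully transparent mask group (pack_cmp ==
 * 0xffff) leaves the destination untouched, so glyph-style masks with
 * large empty areas skip most of the work.
 */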
michael@0 2531 static void
michael@0 2532 sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
michael@0 2533 pixman_composite_info_t *info)
michael@0 2534 {
michael@0 2535 PIXMAN_COMPOSITE_ARGS (info);
michael@0 2536 uint32_t src;
michael@0 2537 uint32_t *dst_line, d;
michael@0 2538 uint32_t *mask_line, m;
michael@0 2539 uint32_t pack_cmp;
michael@0 2540 int dst_stride, mask_stride;
michael@0 2541
michael@0 2542 __m128i xmm_src;
michael@0 2543 __m128i xmm_dst;
michael@0 2544 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
michael@0 2545
michael@0 2546 __m128i mmx_src, mmx_mask, mmx_dest;
michael@0 2547
michael@0 2548 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
michael@0 2549
michael@0 2550 if (src == 0)
michael@0 2551 return;
michael@0 2552
michael@0 2553 PIXMAN_IMAGE_GET_LINE (
michael@0 2554 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
michael@0 2555 PIXMAN_IMAGE_GET_LINE (
michael@0 2556 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
michael@0 2557
michael@0 2558 xmm_src = _mm_unpacklo_epi8 (
michael@0 2559 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
michael@0 2560 mmx_src = xmm_src;
michael@0 2561
michael@0 2562 while (height--)
michael@0 2563 {
michael@0 2564 int w = width;
michael@0 2565 const uint32_t *pm = (uint32_t *)mask_line;
michael@0 2566 uint32_t *pd = (uint32_t *)dst_line;
michael@0 2567
michael@0 2568 dst_line += dst_stride;
michael@0 2569 mask_line += mask_stride;
michael@0 2570
michael@0 2571 while (w && (uintptr_t)pd & 15)
michael@0 2572 {
michael@0 2573 m = *pm++;
michael@0 2574
michael@0 2575 if (m)
michael@0 2576 {
michael@0 2577 d = *pd;
michael@0 2578
michael@0 2579 mmx_mask = unpack_32_1x128 (m);
michael@0 2580 mmx_dest = unpack_32_1x128 (d);
michael@0 2581
michael@0 2582 *pd = pack_1x128_32 (
michael@0 2583 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
michael@0 2584 mmx_dest));
michael@0 2585 }
michael@0 2586
michael@0 2587 pd++;
michael@0 2588 w--;
michael@0 2589 }
michael@0 2590
michael@0 2591 while (w >= 4)
michael@0 2592 {
michael@0 2593 xmm_mask = load_128_unaligned ((__m128i*)pm);
michael@0 2594
michael@0 2595 pack_cmp =
michael@0 2596 _mm_movemask_epi8 (
michael@0 2597 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
michael@0 2598
michael@0 2599 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
michael@0 2600 if (pack_cmp != 0xffff)
michael@0 2601 {
michael@0 2602 xmm_dst = load_128_aligned ((__m128i*)pd);
michael@0 2603
michael@0 2604 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
michael@0 2605
michael@0 2606 pix_multiply_2x128 (&xmm_src, &xmm_src,
michael@0 2607 &xmm_mask_lo, &xmm_mask_hi,
michael@0 2608 &xmm_mask_lo, &xmm_mask_hi);
michael@0 2609 xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
michael@0 2610
michael@0 2611 save_128_aligned (
michael@0 2612 (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
michael@0 2613 }
michael@0 2614
michael@0 2615 pd += 4;
michael@0 2616 pm += 4;
michael@0 2617 w -= 4;
michael@0 2618 }
michael@0 2619
michael@0 2620 while (w)
michael@0 2621 {
michael@0 2622 m = *pm++;
michael@0 2623
michael@0 2624 if (m)
michael@0 2625 {
michael@0 2626 d = *pd;
michael@0 2627
michael@0 2628 mmx_mask = unpack_32_1x128 (m);
michael@0 2629 mmx_dest = unpack_32_1x128 (d);
michael@0 2630
michael@0 2631 *pd = pack_1x128_32 (
michael@0 2632 _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
michael@0 2633 mmx_dest));
michael@0 2634 }
michael@0 2635
michael@0 2636 pd++;
michael@0 2637 w--;
michael@0 2638 }
michael@0 2639 }
michael@0 2640
michael@0 2641 }
michael@0 2642
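/* OVER with a solid source and a component-alpha mask:
 * dest = (src IN mask) OVER dest, evaluated per channel by
 * in_over_1x128 / in_over_2x128. The same all-zero-mask test as in
 * the ADD path above skips fully transparent groups of four.
 */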
michael@0 2643 static void
michael@0 2644 sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
michael@0 2645 pixman_composite_info_t *info)
michael@0 2646 {
michael@0 2647 PIXMAN_COMPOSITE_ARGS (info);
michael@0 2648 uint32_t src;
michael@0 2649 uint32_t *dst_line, d;
michael@0 2650 uint32_t *mask_line, m;
michael@0 2651 uint32_t pack_cmp;
michael@0 2652 int dst_stride, mask_stride;
michael@0 2653
michael@0 2654 __m128i xmm_src, xmm_alpha;
michael@0 2655 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
michael@0 2656 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
michael@0 2657
michael@0 2658 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
michael@0 2659
michael@0 2660 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
michael@0 2661
michael@0 2662 if (src == 0)
michael@0 2663 return;
michael@0 2664
michael@0 2665 PIXMAN_IMAGE_GET_LINE (
michael@0 2666 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
michael@0 2667 PIXMAN_IMAGE_GET_LINE (
michael@0 2668 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
michael@0 2669
michael@0 2670 xmm_src = _mm_unpacklo_epi8 (
michael@0 2671 create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
michael@0 2672 xmm_alpha = expand_alpha_1x128 (xmm_src);
michael@0 2673 mmx_src = xmm_src;
michael@0 2674 mmx_alpha = xmm_alpha;
michael@0 2675
michael@0 2676 while (height--)
michael@0 2677 {
michael@0 2678 int w = width;
michael@0 2679 const uint32_t *pm = (uint32_t *)mask_line;
michael@0 2680 uint32_t *pd = (uint32_t *)dst_line;
michael@0 2681
michael@0 2682 dst_line += dst_stride;
michael@0 2683 mask_line += mask_stride;
michael@0 2684
michael@0 2685 while (w && (uintptr_t)pd & 15)
michael@0 2686 {
michael@0 2687 m = *pm++;
michael@0 2688
michael@0 2689 if (m)
michael@0 2690 {
michael@0 2691 d = *pd;
michael@0 2692 mmx_mask = unpack_32_1x128 (m);
michael@0 2693 mmx_dest = unpack_32_1x128 (d);
michael@0 2694
michael@0 2695 *pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
michael@0 2696 &mmx_alpha,
michael@0 2697 &mmx_mask,
michael@0 2698 &mmx_dest));
michael@0 2699 }
michael@0 2700
michael@0 2701 pd++;
michael@0 2702 w--;
michael@0 2703 }
michael@0 2704
michael@0 2705 while (w >= 4)
michael@0 2706 {
michael@0 2707 xmm_mask = load_128_unaligned ((__m128i*)pm);
michael@0 2708
michael@0 2709 pack_cmp =
michael@0 2710 _mm_movemask_epi8 (
michael@0 2711 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
michael@0 2712
michael@0 2713 /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
michael@0 2714 if (pack_cmp != 0xffff)
michael@0 2715 {
michael@0 2716 xmm_dst = load_128_aligned ((__m128i*)pd);
michael@0 2717
michael@0 2718 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
michael@0 2719 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
michael@0 2720
michael@0 2721 in_over_2x128 (&xmm_src, &xmm_src,
michael@0 2722 &xmm_alpha, &xmm_alpha,
michael@0 2723 &xmm_mask_lo, &xmm_mask_hi,
michael@0 2724 &xmm_dst_lo, &xmm_dst_hi);
michael@0 2725
michael@0 2726 save_128_aligned (
michael@0 2727 (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 2728 }
michael@0 2729
michael@0 2730 pd += 4;
michael@0 2731 pm += 4;
michael@0 2732 w -= 4;
michael@0 2733 }
michael@0 2734
michael@0 2735 while (w)
michael@0 2736 {
michael@0 2737 m = *pm++;
michael@0 2738
michael@0 2739 if (m)
michael@0 2740 {
michael@0 2741 d = *pd;
michael@0 2742 mmx_mask = unpack_32_1x128 (m);
michael@0 2743 mmx_dest = unpack_32_1x128 (d);
michael@0 2744
michael@0 2745 *pd = pack_1x128_32 (
michael@0 2746 in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
michael@0 2747 }
michael@0 2748
michael@0 2749 pd++;
michael@0 2750 w--;
michael@0 2751 }
michael@0 2752 }
michael@0 2753
michael@0 2754 }
michael@0 2755
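/* OVER of an a8r8g8b8 source under a solid mask. Only the alpha byte
 * of the solid mask matters, so it is replicated into every 16-bit
 * lane once via create_mask_16_128 (mask >> 24); is_zero lets groups
 * of four fully transparent source pixels be skipped.
 */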
michael@0 2756 static void
michael@0 2757 sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
michael@0 2758 pixman_composite_info_t *info)
michael@0 2759 {
michael@0 2760 PIXMAN_COMPOSITE_ARGS (info);
michael@0 2761 uint32_t *dst_line, *dst;
michael@0 2762 uint32_t *src_line, *src;
michael@0 2763 uint32_t mask;
michael@0 2764 int32_t w;
michael@0 2765 int dst_stride, src_stride;
michael@0 2766
michael@0 2767 __m128i xmm_mask;
michael@0 2768 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
michael@0 2769 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
michael@0 2770 __m128i xmm_alpha_lo, xmm_alpha_hi;
michael@0 2771
michael@0 2772 PIXMAN_IMAGE_GET_LINE (
michael@0 2773 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
michael@0 2774 PIXMAN_IMAGE_GET_LINE (
michael@0 2775 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
michael@0 2776
michael@0 2777 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
michael@0 2778
michael@0 2779 xmm_mask = create_mask_16_128 (mask >> 24);
michael@0 2780
michael@0 2781 while (height--)
michael@0 2782 {
michael@0 2783 dst = dst_line;
michael@0 2784 dst_line += dst_stride;
michael@0 2785 src = src_line;
michael@0 2786 src_line += src_stride;
michael@0 2787 w = width;
michael@0 2788
michael@0 2789 while (w && (uintptr_t)dst & 15)
michael@0 2790 {
michael@0 2791 uint32_t s = *src++;
michael@0 2792
michael@0 2793 if (s)
michael@0 2794 {
michael@0 2795 uint32_t d = *dst;
michael@0 2796
michael@0 2797 __m128i ms = unpack_32_1x128 (s);
michael@0 2798 __m128i alpha = expand_alpha_1x128 (ms);
michael@0 2799 __m128i dest = xmm_mask;
michael@0 2800 __m128i alpha_dst = unpack_32_1x128 (d);
michael@0 2801
michael@0 2802 *dst = pack_1x128_32 (
michael@0 2803 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
michael@0 2804 }
michael@0 2805 dst++;
michael@0 2806 w--;
michael@0 2807 }
michael@0 2808
michael@0 2809 while (w >= 4)
michael@0 2810 {
michael@0 2811 xmm_src = load_128_unaligned ((__m128i*)src);
michael@0 2812
michael@0 2813 if (!is_zero (xmm_src))
michael@0 2814 {
michael@0 2815 xmm_dst = load_128_aligned ((__m128i*)dst);
michael@0 2816
michael@0 2817 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
michael@0 2818 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
michael@0 2819 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
michael@0 2820 &xmm_alpha_lo, &xmm_alpha_hi);
michael@0 2821
michael@0 2822 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
michael@0 2823 &xmm_alpha_lo, &xmm_alpha_hi,
michael@0 2824 &xmm_mask, &xmm_mask,
michael@0 2825 &xmm_dst_lo, &xmm_dst_hi);
michael@0 2826
michael@0 2827 save_128_aligned (
michael@0 2828 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 2829 }
michael@0 2830
michael@0 2831 dst += 4;
michael@0 2832 src += 4;
michael@0 2833 w -= 4;
michael@0 2834 }
michael@0 2835
michael@0 2836 while (w)
michael@0 2837 {
michael@0 2838 uint32_t s = *src++;
michael@0 2839
michael@0 2840 if (s)
michael@0 2841 {
michael@0 2842 uint32_t d = *dst;
michael@0 2843
michael@0 2844 __m128i ms = unpack_32_1x128 (s);
michael@0 2845 __m128i alpha = expand_alpha_1x128 (ms);
michael@0 2846 __m128i mask = xmm_mask;
michael@0 2847 __m128i dest = unpack_32_1x128 (d);
michael@0 2848
michael@0 2849 *dst = pack_1x128_32 (
michael@0 2850 in_over_1x128 (&ms, &alpha, &mask, &dest));
michael@0 2851 }
michael@0 2852
michael@0 2853 dst++;
michael@0 2854 w--;
michael@0 2855 }
michael@0 2856 }
michael@0 2857
michael@0 2858 }
michael@0 2859
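/* SRC conversion from x8r8g8b8 to r5g6b5. Per pixel this is the
 * usual truncating repack,
 *
 *     ((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3)
 *
 * and the SIMD body converts eight pixels at a time with
 * pack_565_2packedx128_128.
 */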
michael@0 2860 static void
michael@0 2861 sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
michael@0 2862 pixman_composite_info_t *info)
michael@0 2863 {
michael@0 2864 PIXMAN_COMPOSITE_ARGS (info);
michael@0 2865 uint16_t *dst_line, *dst;
michael@0 2866 uint32_t *src_line, *src, s;
michael@0 2867 int dst_stride, src_stride;
michael@0 2868 int32_t w;
michael@0 2869
michael@0 2870 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
michael@0 2871 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
michael@0 2872
michael@0 2873 while (height--)
michael@0 2874 {
michael@0 2875 dst = dst_line;
michael@0 2876 dst_line += dst_stride;
michael@0 2877 src = src_line;
michael@0 2878 src_line += src_stride;
michael@0 2879 w = width;
michael@0 2880
michael@0 2881 while (w && (uintptr_t)dst & 15)
michael@0 2882 {
michael@0 2883 s = *src++;
michael@0 2884 *dst = convert_8888_to_0565 (s);
michael@0 2885 dst++;
michael@0 2886 w--;
michael@0 2887 }
michael@0 2888
michael@0 2889 while (w >= 8)
michael@0 2890 {
michael@0 2891 __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
michael@0 2892 __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
michael@0 2893
michael@0 2894 save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
michael@0 2895
michael@0 2896 w -= 8;
michael@0 2897 src += 8;
michael@0 2898 dst += 8;
michael@0 2899 }
michael@0 2900
michael@0 2901 while (w)
michael@0 2902 {
michael@0 2903 s = *src++;
michael@0 2904 *dst = convert_8888_to_0565 (s);
michael@0 2905 dst++;
michael@0 2906 w--;
michael@0 2907 }
michael@0 2908 }
michael@0 2909 }
michael@0 2910
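/* SRC from x8r8g8b8 to a8r8g8b8 only has to force the alpha byte to
 * 0xff, so the body is a plain copy OR-ed with mask_ff000000, sixteen
 * pixels per iteration.
 */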
michael@0 2911 static void
michael@0 2912 sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
michael@0 2913 pixman_composite_info_t *info)
michael@0 2914 {
michael@0 2915 PIXMAN_COMPOSITE_ARGS (info);
michael@0 2916 uint32_t *dst_line, *dst;
michael@0 2917 uint32_t *src_line, *src;
michael@0 2918 int32_t w;
michael@0 2919 int dst_stride, src_stride;
michael@0 2920
michael@0 2922 PIXMAN_IMAGE_GET_LINE (
michael@0 2923 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
michael@0 2924 PIXMAN_IMAGE_GET_LINE (
michael@0 2925 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
michael@0 2926
michael@0 2927 while (height--)
michael@0 2928 {
michael@0 2929 dst = dst_line;
michael@0 2930 dst_line += dst_stride;
michael@0 2931 src = src_line;
michael@0 2932 src_line += src_stride;
michael@0 2933 w = width;
michael@0 2934
michael@0 2935 while (w && (uintptr_t)dst & 15)
michael@0 2936 {
michael@0 2937 *dst++ = *src++ | 0xff000000;
michael@0 2938 w--;
michael@0 2939 }
michael@0 2940
michael@0 2941 while (w >= 16)
michael@0 2942 {
michael@0 2943 __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
michael@0 2944
michael@0 2945 xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
michael@0 2946 xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
michael@0 2947 xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
michael@0 2948 xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
michael@0 2949
michael@0 2950 save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
michael@0 2951 save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
michael@0 2952 save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
michael@0 2953 save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
michael@0 2954
michael@0 2955 dst += 16;
michael@0 2956 src += 16;
michael@0 2957 w -= 16;
michael@0 2958 }
michael@0 2959
michael@0 2960 while (w)
michael@0 2961 {
michael@0 2962 *dst++ = *src++ | 0xff000000;
michael@0 2963 w--;
michael@0 2964 }
michael@0 2965 }
michael@0 2966
michael@0 2967 }
michael@0 2968
michael@0 2969 static void
michael@0 2970 sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
michael@0 2971 pixman_composite_info_t *info)
michael@0 2972 {
michael@0 2973 PIXMAN_COMPOSITE_ARGS (info);
michael@0 2974 uint32_t *dst_line, *dst;
michael@0 2975 uint32_t *src_line, *src;
michael@0 2976 uint32_t mask;
michael@0 2977 int dst_stride, src_stride;
michael@0 2978 int32_t w;
michael@0 2979
michael@0 2980 __m128i xmm_mask, xmm_alpha;
michael@0 2981 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
michael@0 2982 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
michael@0 2983
michael@0 2984 PIXMAN_IMAGE_GET_LINE (
michael@0 2985 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
michael@0 2986 PIXMAN_IMAGE_GET_LINE (
michael@0 2987 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
michael@0 2988
michael@0 2989 mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
michael@0 2990
michael@0 2991 xmm_mask = create_mask_16_128 (mask >> 24);
michael@0 2992 xmm_alpha = mask_00ff;
michael@0 2993
michael@0 2994 while (height--)
michael@0 2995 {
michael@0 2996 dst = dst_line;
michael@0 2997 dst_line += dst_stride;
michael@0 2998 src = src_line;
michael@0 2999 src_line += src_stride;
michael@0 3000 w = width;
michael@0 3001
michael@0 3002 while (w && (uintptr_t)dst & 15)
michael@0 3003 {
michael@0 3004 uint32_t s = (*src++) | 0xff000000;
michael@0 3005 uint32_t d = *dst;
michael@0 3006
michael@0 3007 __m128i src = unpack_32_1x128 (s);
michael@0 3008 __m128i alpha = xmm_alpha;
michael@0 3009 __m128i mask = xmm_mask;
michael@0 3010 __m128i dest = unpack_32_1x128 (d);
michael@0 3011
michael@0 3012 *dst++ = pack_1x128_32 (
michael@0 3013 in_over_1x128 (&src, &alpha, &mask, &dest));
michael@0 3014
michael@0 3015 w--;
michael@0 3016 }
michael@0 3017
michael@0 3018 while (w >= 4)
michael@0 3019 {
michael@0 3020 xmm_src = _mm_or_si128 (
michael@0 3021 load_128_unaligned ((__m128i*)src), mask_ff000000);
michael@0 3022 xmm_dst = load_128_aligned ((__m128i*)dst);
michael@0 3023
michael@0 3024 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
michael@0 3025 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
michael@0 3026
michael@0 3027 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
michael@0 3028 &xmm_alpha, &xmm_alpha,
michael@0 3029 &xmm_mask, &xmm_mask,
michael@0 3030 &xmm_dst_lo, &xmm_dst_hi);
michael@0 3031
michael@0 3032 save_128_aligned (
michael@0 3033 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 3034
michael@0 3035 dst += 4;
michael@0 3036 src += 4;
michael@0 3037 w -= 4;
michael@0 3038
michael@0 3039 }
michael@0 3040
michael@0 3041 while (w)
michael@0 3042 {
michael@0 3043 uint32_t s = (*src++) | 0xff000000;
michael@0 3044 uint32_t d = *dst;
michael@0 3045
michael@0 3046 __m128i src = unpack_32_1x128 (s);
michael@0 3047 __m128i alpha = xmm_alpha;
michael@0 3048 __m128i mask = xmm_mask;
michael@0 3049 __m128i dest = unpack_32_1x128 (d);
michael@0 3050
michael@0 3051 *dst++ = pack_1x128_32 (
michael@0 3052 in_over_1x128 (&src, &alpha, &mask, &dest));
michael@0 3053
michael@0 3054 w--;
michael@0 3055 }
michael@0 3056 }
michael@0 3057
michael@0 3058 }
michael@0 3059
michael@0 3060 static void
michael@0 3061 sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
michael@0 3062 pixman_composite_info_t *info)
michael@0 3063 {
michael@0 3064 PIXMAN_COMPOSITE_ARGS (info);
michael@0 3065 int dst_stride, src_stride;
michael@0 3066 uint32_t *dst_line, *dst;
michael@0 3067 uint32_t *src_line, *src;
michael@0 3068
michael@0 3069 PIXMAN_IMAGE_GET_LINE (
michael@0 3070 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
michael@0 3071 PIXMAN_IMAGE_GET_LINE (
michael@0 3072 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
michael@0 3073
michael@0 3074 dst = dst_line;
michael@0 3075 src = src_line;
michael@0 3076
michael@0 3077 while (height--)
michael@0 3078 {
michael@0 3079 sse2_combine_over_u (imp, op, dst, src, NULL, width);
michael@0 3080
michael@0 3081 dst += dst_stride;
michael@0 3082 src += src_stride;
michael@0 3083 }
michael@0 3084 }
michael@0 3085
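/* OVER into r5g6b5 destinations: each 16-bit pixel is expanded to
 * 8-bit channels (expand565_16_1x128), blended, and packed back.
 * The 8-pixel body below interleaves the load of the next four
 * source pixels with the blend of the current four.
 */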
michael@0 3086 static force_inline uint16_t
michael@0 3087 composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
michael@0 3088 {
michael@0 3089 __m128i ms;
michael@0 3090
michael@0 3091 ms = unpack_32_1x128 (src);
michael@0 3092 return pack_565_32_16 (
michael@0 3093 pack_1x128_32 (
michael@0 3094 over_1x128 (
michael@0 3095 ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
michael@0 3096 }
michael@0 3097
michael@0 3098 static void
michael@0 3099 sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
michael@0 3100 pixman_composite_info_t *info)
michael@0 3101 {
michael@0 3102 PIXMAN_COMPOSITE_ARGS (info);
michael@0 3103 uint16_t *dst_line, *dst, d;
michael@0 3104 uint32_t *src_line, *src, s;
michael@0 3105 int dst_stride, src_stride;
michael@0 3106 int32_t w;
michael@0 3107
michael@0 3108 __m128i xmm_alpha_lo, xmm_alpha_hi;
michael@0 3109 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
michael@0 3110 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
michael@0 3111
michael@0 3112 PIXMAN_IMAGE_GET_LINE (
michael@0 3113 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
michael@0 3114 PIXMAN_IMAGE_GET_LINE (
michael@0 3115 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
michael@0 3116
michael@0 3117 while (height--)
michael@0 3118 {
michael@0 3119 dst = dst_line;
michael@0 3120 src = src_line;
michael@0 3121
michael@0 3122 dst_line += dst_stride;
michael@0 3123 src_line += src_stride;
michael@0 3124 w = width;
michael@0 3125
michael@0 3126 /* Align dst on a 16-byte boundary */
michael@0 3127 while (w &&
michael@0 3128 ((uintptr_t)dst & 15))
michael@0 3129 {
michael@0 3130 s = *src++;
michael@0 3131 d = *dst;
michael@0 3132
michael@0 3133 *dst++ = composite_over_8888_0565pixel (s, d);
michael@0 3134 w--;
michael@0 3135 }
michael@0 3136
michael@0 3137 /* It's an 8 pixel loop */
michael@0 3138 while (w >= 8)
michael@0 3139 {
michael@0 3140 /* I'm loading unaligned because I'm not sure
michael@0 3141 * about the address alignment.
michael@0 3142 */
michael@0 3143 xmm_src = load_128_unaligned ((__m128i*) src);
michael@0 3144 xmm_dst = load_128_aligned ((__m128i*) dst);
michael@0 3145
michael@0 3146 /* Unpacking */
michael@0 3147 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
michael@0 3148 unpack_565_128_4x128 (xmm_dst,
michael@0 3149 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
michael@0 3150 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
michael@0 3151 &xmm_alpha_lo, &xmm_alpha_hi);
michael@0 3152
michael@0 3153 /* I'm loading the next 4 pixels from memory
michael@0 3154 * early, to optimize the memory read.
michael@0 3155 */
michael@0 3156 xmm_src = load_128_unaligned ((__m128i*) (src + 4));
michael@0 3157
michael@0 3158 over_2x128 (&xmm_src_lo, &xmm_src_hi,
michael@0 3159 &xmm_alpha_lo, &xmm_alpha_hi,
michael@0 3160 &xmm_dst0, &xmm_dst1);
michael@0 3161
michael@0 3162 /* Unpacking */
michael@0 3163 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
michael@0 3164 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
michael@0 3165 &xmm_alpha_lo, &xmm_alpha_hi);
michael@0 3166
michael@0 3167 over_2x128 (&xmm_src_lo, &xmm_src_hi,
michael@0 3168 &xmm_alpha_lo, &xmm_alpha_hi,
michael@0 3169 &xmm_dst2, &xmm_dst3);
michael@0 3170
michael@0 3171 save_128_aligned (
michael@0 3172 (__m128i*)dst, pack_565_4x128_128 (
michael@0 3173 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
michael@0 3174
michael@0 3175 w -= 8;
michael@0 3176 dst += 8;
michael@0 3177 src += 8;
michael@0 3178 }
michael@0 3179
michael@0 3180 while (w--)
michael@0 3181 {
michael@0 3182 s = *src++;
michael@0 3183 d = *dst;
michael@0 3184
michael@0 3185 *dst++ = composite_over_8888_0565pixel (s, d);
michael@0 3186 }
michael@0 3187 }
michael@0 3188
michael@0 3189 }
michael@0 3190
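/* OVER with a solid source and an a8 mask. The SIMD body reads four
 * mask bytes at once as a uint32_t: m == 0 skips the group entirely,
 * and m == 0xffffffff with an opaque source (srca == 0xff) stores the
 * precomputed solid pixels (xmm_def) directly.
 */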
michael@0 3191 static void
michael@0 3192 sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
michael@0 3193 pixman_composite_info_t *info)
michael@0 3194 {
michael@0 3195 PIXMAN_COMPOSITE_ARGS (info);
michael@0 3196 uint32_t src, srca;
michael@0 3197 uint32_t *dst_line, *dst;
michael@0 3198 uint8_t *mask_line, *mask;
michael@0 3199 int dst_stride, mask_stride;
michael@0 3200 int32_t w;
michael@0 3201 uint32_t m, d;
michael@0 3202
michael@0 3203 __m128i xmm_src, xmm_alpha, xmm_def;
michael@0 3204 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
michael@0 3205 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
michael@0 3206
michael@0 3207 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
michael@0 3208
michael@0 3209 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
michael@0 3210
michael@0 3211 srca = src >> 24;
michael@0 3212 if (src == 0)
michael@0 3213 return;
michael@0 3214
michael@0 3215 PIXMAN_IMAGE_GET_LINE (
michael@0 3216 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
michael@0 3217 PIXMAN_IMAGE_GET_LINE (
michael@0 3218 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
michael@0 3219
michael@0 3220 xmm_def = create_mask_2x32_128 (src, src);
michael@0 3221 xmm_src = expand_pixel_32_1x128 (src);
michael@0 3222 xmm_alpha = expand_alpha_1x128 (xmm_src);
michael@0 3223 mmx_src = xmm_src;
michael@0 3224 mmx_alpha = xmm_alpha;
michael@0 3225
michael@0 3226 while (height--)
michael@0 3227 {
michael@0 3228 dst = dst_line;
michael@0 3229 dst_line += dst_stride;
michael@0 3230 mask = mask_line;
michael@0 3231 mask_line += mask_stride;
michael@0 3232 w = width;
michael@0 3233
michael@0 3234 while (w && (uintptr_t)dst & 15)
michael@0 3235 {
michael@0 3236 uint8_t m = *mask++;
michael@0 3237
michael@0 3238 if (m)
michael@0 3239 {
michael@0 3240 d = *dst;
michael@0 3241 mmx_mask = expand_pixel_8_1x128 (m);
michael@0 3242 mmx_dest = unpack_32_1x128 (d);
michael@0 3243
michael@0 3244 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
michael@0 3245 &mmx_alpha,
michael@0 3246 &mmx_mask,
michael@0 3247 &mmx_dest));
michael@0 3248 }
michael@0 3249
michael@0 3250 w--;
michael@0 3251 dst++;
michael@0 3252 }
michael@0 3253
michael@0 3254 while (w >= 4)
michael@0 3255 {
michael@0 3256 m = *((uint32_t*)mask);
michael@0 3257
michael@0 3258 if (srca == 0xff && m == 0xffffffff)
michael@0 3259 {
michael@0 3260 save_128_aligned ((__m128i*)dst, xmm_def);
michael@0 3261 }
michael@0 3262 else if (m)
michael@0 3263 {
michael@0 3264 xmm_dst = load_128_aligned ((__m128i*) dst);
michael@0 3265 xmm_mask = unpack_32_1x128 (m);
michael@0 3266 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
michael@0 3267
michael@0 3268 /* Unpacking */
michael@0 3269 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
michael@0 3270 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
michael@0 3271
michael@0 3272 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
michael@0 3273 &xmm_mask_lo, &xmm_mask_hi);
michael@0 3274
michael@0 3275 in_over_2x128 (&xmm_src, &xmm_src,
michael@0 3276 &xmm_alpha, &xmm_alpha,
michael@0 3277 &xmm_mask_lo, &xmm_mask_hi,
michael@0 3278 &xmm_dst_lo, &xmm_dst_hi);
michael@0 3279
michael@0 3280 save_128_aligned (
michael@0 3281 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 3282 }
michael@0 3283
michael@0 3284 w -= 4;
michael@0 3285 dst += 4;
michael@0 3286 mask += 4;
michael@0 3287 }
michael@0 3288
michael@0 3289 while (w)
michael@0 3290 {
michael@0 3291 uint8_t m = *mask++;
michael@0 3292
michael@0 3293 if (m)
michael@0 3294 {
michael@0 3295 d = *dst;
michael@0 3296 mmx_mask = expand_pixel_8_1x128 (m);
michael@0 3297 mmx_dest = unpack_32_1x128 (d);
michael@0 3298
michael@0 3299 *dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
michael@0 3300 &mmx_alpha,
michael@0 3301 &mmx_mask,
michael@0 3302 &mmx_dest));
michael@0 3303 }
michael@0 3304
michael@0 3305 w--;
michael@0 3306 dst++;
michael@0 3307 }
michael@0 3308 }
michael@0 3309
michael@0 3310 }
michael@0 3311
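/*
 * sse2_fill: solid fill of an 8/16/32 bpp rectangle.  The filler value
 * is replicated up to 32 bits, the destination is aligned to 16 bytes
 * with scalar stores, the bulk is written with aligned 128-bit stores
 * (up to 128 bytes per iteration), and the remainder is finished with
 * scalar stores.  Returns FALSE for unsupported depths.  On 32-bit GCC
 * the __force_align_arg_pointer__ attribute below realigns the stack
 * so the SSE locals are 16-byte aligned.
 */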
michael@0 3312 #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
michael@0 3313 __attribute__((__force_align_arg_pointer__))
michael@0 3314 #endif
michael@0 3315 static pixman_bool_t
michael@0 3316 sse2_fill (pixman_implementation_t *imp,
michael@0 3317 uint32_t * bits,
michael@0 3318 int stride,
michael@0 3319 int bpp,
michael@0 3320 int x,
michael@0 3321 int y,
michael@0 3322 int width,
michael@0 3323 int height,
michael@0 3324 uint32_t filler)
michael@0 3325 {
michael@0 3326 uint32_t byte_width;
michael@0 3327 uint8_t *byte_line;
michael@0 3328
michael@0 3329 __m128i xmm_def;
michael@0 3330
michael@0 3331 if (bpp == 8)
michael@0 3332 {
michael@0 3333 uint8_t b;
michael@0 3334 uint16_t w;
michael@0 3335
michael@0 3336 stride = stride * (int) sizeof (uint32_t) / 1;
michael@0 3337 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
michael@0 3338 byte_width = width;
michael@0 3339 stride *= 1;
michael@0 3340
michael@0 3341 b = filler & 0xff;
michael@0 3342 w = (b << 8) | b;
michael@0 3343 filler = (w << 16) | w;
michael@0 3344 }
michael@0 3345 else if (bpp == 16)
michael@0 3346 {
michael@0 3347 stride = stride * (int) sizeof (uint32_t) / 2;
michael@0 3348 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
michael@0 3349 byte_width = 2 * width;
michael@0 3350 stride *= 2;
michael@0 3351
michael@0 3352 filler = (filler & 0xffff) * 0x00010001;
michael@0 3353 }
michael@0 3354 else if (bpp == 32)
michael@0 3355 {
michael@0 3356 stride = stride * (int) sizeof (uint32_t) / 4;
michael@0 3357 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
michael@0 3358 byte_width = 4 * width;
michael@0 3359 stride *= 4;
michael@0 3360 }
michael@0 3361 else
michael@0 3362 {
michael@0 3363 return FALSE;
michael@0 3364 }
michael@0 3365
michael@0 3366 xmm_def = create_mask_2x32_128 (filler, filler);
michael@0 3367
michael@0 3368 while (height--)
michael@0 3369 {
michael@0 3370 int w;
michael@0 3371 uint8_t *d = byte_line;
michael@0 3372 byte_line += stride;
michael@0 3373 w = byte_width;
michael@0 3374
michael@0 3375 if (w >= 1 && ((uintptr_t)d & 1))
michael@0 3376 {
michael@0 3377 *(uint8_t *)d = filler;
michael@0 3378 w -= 1;
michael@0 3379 d += 1;
michael@0 3380 }
michael@0 3381
michael@0 3382 while (w >= 2 && ((uintptr_t)d & 3))
michael@0 3383 {
michael@0 3384 *(uint16_t *)d = filler;
michael@0 3385 w -= 2;
michael@0 3386 d += 2;
michael@0 3387 }
michael@0 3388
michael@0 3389 while (w >= 4 && ((uintptr_t)d & 15))
michael@0 3390 {
michael@0 3391 *(uint32_t *)d = filler;
michael@0 3392
michael@0 3393 w -= 4;
michael@0 3394 d += 4;
michael@0 3395 }
michael@0 3396
michael@0 3397 while (w >= 128)
michael@0 3398 {
michael@0 3399 save_128_aligned ((__m128i*)(d), xmm_def);
michael@0 3400 save_128_aligned ((__m128i*)(d + 16), xmm_def);
michael@0 3401 save_128_aligned ((__m128i*)(d + 32), xmm_def);
michael@0 3402 save_128_aligned ((__m128i*)(d + 48), xmm_def);
michael@0 3403 save_128_aligned ((__m128i*)(d + 64), xmm_def);
michael@0 3404 save_128_aligned ((__m128i*)(d + 80), xmm_def);
michael@0 3405 save_128_aligned ((__m128i*)(d + 96), xmm_def);
michael@0 3406 save_128_aligned ((__m128i*)(d + 112), xmm_def);
michael@0 3407
michael@0 3408 d += 128;
michael@0 3409 w -= 128;
michael@0 3410 }
michael@0 3411
michael@0 3412 if (w >= 64)
michael@0 3413 {
michael@0 3414 save_128_aligned ((__m128i*)(d), xmm_def);
michael@0 3415 save_128_aligned ((__m128i*)(d + 16), xmm_def);
michael@0 3416 save_128_aligned ((__m128i*)(d + 32), xmm_def);
michael@0 3417 save_128_aligned ((__m128i*)(d + 48), xmm_def);
michael@0 3418
michael@0 3419 d += 64;
michael@0 3420 w -= 64;
michael@0 3421 }
michael@0 3422
michael@0 3423 if (w >= 32)
michael@0 3424 {
michael@0 3425 save_128_aligned ((__m128i*)(d), xmm_def);
michael@0 3426 save_128_aligned ((__m128i*)(d + 16), xmm_def);
michael@0 3427
michael@0 3428 d += 32;
michael@0 3429 w -= 32;
michael@0 3430 }
michael@0 3431
michael@0 3432 if (w >= 16)
michael@0 3433 {
michael@0 3434 save_128_aligned ((__m128i*)(d), xmm_def);
michael@0 3435
michael@0 3436 d += 16;
michael@0 3437 w -= 16;
michael@0 3438 }
michael@0 3439
michael@0 3440 while (w >= 4)
michael@0 3441 {
michael@0 3442 *(uint32_t *)d = filler;
michael@0 3443
michael@0 3444 w -= 4;
michael@0 3445 d += 4;
michael@0 3446 }
michael@0 3447
michael@0 3448 if (w >= 2)
michael@0 3449 {
michael@0 3450 *(uint16_t *)d = filler;
michael@0 3451 w -= 2;
michael@0 3452 d += 2;
michael@0 3453 }
michael@0 3454
michael@0 3455 if (w >= 1)
michael@0 3456 {
michael@0 3457 *(uint8_t *)d = filler;
michael@0 3458 w -= 1;
michael@0 3459 d += 1;
michael@0 3460 }
michael@0 3461 }
michael@0 3462
michael@0 3463 return TRUE;
michael@0 3464 }
michael@0 3465
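/*
 * SRC operator, solid source modulated by an a8 mask, into a 32 bpp
 * destination: dest = src * mask.  A zero source degenerates to a fill
 * with zero; a fully opaque source under a 0xffffffff mask word is a
 * plain aligned store of the replicated source.
 */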
michael@0 3466 static void
michael@0 3467 sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
michael@0 3468 pixman_composite_info_t *info)
michael@0 3469 {
michael@0 3470 PIXMAN_COMPOSITE_ARGS (info);
michael@0 3471 uint32_t src, srca;
michael@0 3472 uint32_t *dst_line, *dst;
michael@0 3473 uint8_t *mask_line, *mask;
michael@0 3474 int dst_stride, mask_stride;
michael@0 3475 int32_t w;
michael@0 3476 uint32_t m;
michael@0 3477
michael@0 3478 __m128i xmm_src, xmm_def;
michael@0 3479 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
michael@0 3480
michael@0 3481 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
michael@0 3482
michael@0 3483 srca = src >> 24;
michael@0 3484 if (src == 0)
michael@0 3485 {
michael@0 3486 sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
michael@0 3487 PIXMAN_FORMAT_BPP (dest_image->bits.format),
michael@0 3488 dest_x, dest_y, width, height, 0);
michael@0 3489 return;
michael@0 3490 }
michael@0 3491
michael@0 3492 PIXMAN_IMAGE_GET_LINE (
michael@0 3493 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
michael@0 3494 PIXMAN_IMAGE_GET_LINE (
michael@0 3495 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
michael@0 3496
michael@0 3497 xmm_def = create_mask_2x32_128 (src, src);
michael@0 3498 xmm_src = expand_pixel_32_1x128 (src);
michael@0 3499
michael@0 3500 while (height--)
michael@0 3501 {
michael@0 3502 dst = dst_line;
michael@0 3503 dst_line += dst_stride;
michael@0 3504 mask = mask_line;
michael@0 3505 mask_line += mask_stride;
michael@0 3506 w = width;
michael@0 3507
michael@0 3508 while (w && (uintptr_t)dst & 15)
michael@0 3509 {
michael@0 3510 uint8_t m = *mask++;
michael@0 3511
michael@0 3512 if (m)
michael@0 3513 {
michael@0 3514 *dst = pack_1x128_32 (
michael@0 3515 pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
michael@0 3516 }
michael@0 3517 else
michael@0 3518 {
michael@0 3519 *dst = 0;
michael@0 3520 }
michael@0 3521
michael@0 3522 w--;
michael@0 3523 dst++;
michael@0 3524 }
michael@0 3525
michael@0 3526 while (w >= 4)
michael@0 3527 {
michael@0 3528 m = *((uint32_t*)mask);
michael@0 3529
michael@0 3530 if (srca == 0xff && m == 0xffffffff)
michael@0 3531 {
michael@0 3532 save_128_aligned ((__m128i*)dst, xmm_def);
michael@0 3533 }
michael@0 3534 else if (m)
michael@0 3535 {
michael@0 3536 xmm_mask = unpack_32_1x128 (m);
michael@0 3537 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
michael@0 3538
michael@0 3539 /* Unpacking */
michael@0 3540 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
michael@0 3541
michael@0 3542 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
michael@0 3543 &xmm_mask_lo, &xmm_mask_hi);
michael@0 3544
michael@0 3545 pix_multiply_2x128 (&xmm_src, &xmm_src,
michael@0 3546 &xmm_mask_lo, &xmm_mask_hi,
michael@0 3547 &xmm_mask_lo, &xmm_mask_hi);
michael@0 3548
michael@0 3549 save_128_aligned (
michael@0 3550 (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
michael@0 3551 }
michael@0 3552 else
michael@0 3553 {
michael@0 3554 save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
michael@0 3555 }
michael@0 3556
michael@0 3557 w -= 4;
michael@0 3558 dst += 4;
michael@0 3559 mask += 4;
michael@0 3560 }
michael@0 3561
michael@0 3562 while (w)
michael@0 3563 {
michael@0 3564 uint8_t m = *mask++;
michael@0 3565
michael@0 3566 if (m)
michael@0 3567 {
michael@0 3568 *dst = pack_1x128_32 (
michael@0 3569 pix_multiply_1x128 (
michael@0 3570 xmm_src, expand_pixel_8_1x128 (m)));
michael@0 3571 }
michael@0 3572 else
michael@0 3573 {
michael@0 3574 *dst = 0;
michael@0 3575 }
michael@0 3576
michael@0 3577 w--;
michael@0 3578 dst++;
michael@0 3579 }
michael@0 3580 }
michael@0 3581
michael@0 3582 }
michael@0 3583
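/*
 * OVER operator, solid source with an a8 mask, into an r5g6b5
 * destination.  The aligned loop expands eight 565 pixels into four
 * 8888-format register halves, applies in_over twice (one 4-byte mask
 * word per group of four pixels), and packs the result back to 565.
 */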
michael@0 3584 static void
michael@0 3585 sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
michael@0 3586 pixman_composite_info_t *info)
michael@0 3587 {
michael@0 3588 PIXMAN_COMPOSITE_ARGS (info);
michael@0 3589 uint32_t src;
michael@0 3590 uint16_t *dst_line, *dst, d;
michael@0 3591 uint8_t *mask_line, *mask;
michael@0 3592 int dst_stride, mask_stride;
michael@0 3593 int32_t w;
michael@0 3594 uint32_t m;
michael@0 3595 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
michael@0 3596
michael@0 3597 __m128i xmm_src, xmm_alpha;
michael@0 3598 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
michael@0 3599 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
michael@0 3600
michael@0 3601 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
michael@0 3602
michael@0 3603 if (src == 0)
michael@0 3604 return;
michael@0 3605
michael@0 3606 PIXMAN_IMAGE_GET_LINE (
michael@0 3607 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
michael@0 3608 PIXMAN_IMAGE_GET_LINE (
michael@0 3609 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
michael@0 3610
michael@0 3611 xmm_src = expand_pixel_32_1x128 (src);
michael@0 3612 xmm_alpha = expand_alpha_1x128 (xmm_src);
michael@0 3613 mmx_src = xmm_src;
michael@0 3614 mmx_alpha = xmm_alpha;
michael@0 3615
michael@0 3616 while (height--)
michael@0 3617 {
michael@0 3618 dst = dst_line;
michael@0 3619 dst_line += dst_stride;
michael@0 3620 mask = mask_line;
michael@0 3621 mask_line += mask_stride;
michael@0 3622 w = width;
michael@0 3623
michael@0 3624 while (w && (uintptr_t)dst & 15)
michael@0 3625 {
michael@0 3626 m = *mask++;
michael@0 3627
michael@0 3628 if (m)
michael@0 3629 {
michael@0 3630 d = *dst;
michael@0 3631 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
michael@0 3632 mmx_dest = expand565_16_1x128 (d);
michael@0 3633
michael@0 3634 *dst = pack_565_32_16 (
michael@0 3635 pack_1x128_32 (
michael@0 3636 in_over_1x128 (
michael@0 3637 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
michael@0 3638 }
michael@0 3639
michael@0 3640 w--;
michael@0 3641 dst++;
michael@0 3642 }
michael@0 3643
michael@0 3644 while (w >= 8)
michael@0 3645 {
michael@0 3646 xmm_dst = load_128_aligned ((__m128i*) dst);
michael@0 3647 unpack_565_128_4x128 (xmm_dst,
michael@0 3648 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
michael@0 3649
michael@0 3650 m = *((uint32_t*)mask);
michael@0 3651 mask += 4;
michael@0 3652
michael@0 3653 if (m)
michael@0 3654 {
michael@0 3655 xmm_mask = unpack_32_1x128 (m);
michael@0 3656 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
michael@0 3657
michael@0 3658 /* Unpacking */
michael@0 3659 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
michael@0 3660
michael@0 3661 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
michael@0 3662 &xmm_mask_lo, &xmm_mask_hi);
michael@0 3663
michael@0 3664 in_over_2x128 (&xmm_src, &xmm_src,
michael@0 3665 &xmm_alpha, &xmm_alpha,
michael@0 3666 &xmm_mask_lo, &xmm_mask_hi,
michael@0 3667 &xmm_dst0, &xmm_dst1);
michael@0 3668 }
michael@0 3669
michael@0 3670 m = *((uint32_t*)mask);
michael@0 3671 mask += 4;
michael@0 3672
michael@0 3673 if (m)
michael@0 3674 {
michael@0 3675 xmm_mask = unpack_32_1x128 (m);
michael@0 3676 xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
michael@0 3677
michael@0 3678 /* Unpacking */
michael@0 3679 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
michael@0 3680
michael@0 3681 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
michael@0 3682 &xmm_mask_lo, &xmm_mask_hi);
michael@0 3683 in_over_2x128 (&xmm_src, &xmm_src,
michael@0 3684 &xmm_alpha, &xmm_alpha,
michael@0 3685 &xmm_mask_lo, &xmm_mask_hi,
michael@0 3686 &xmm_dst2, &xmm_dst3);
michael@0 3687 }
michael@0 3688
michael@0 3689 save_128_aligned (
michael@0 3690 (__m128i*)dst, pack_565_4x128_128 (
michael@0 3691 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
michael@0 3692
michael@0 3693 w -= 8;
michael@0 3694 dst += 8;
michael@0 3695 }
michael@0 3696
michael@0 3697 while (w)
michael@0 3698 {
michael@0 3699 m = *mask++;
michael@0 3700
michael@0 3701 if (m)
michael@0 3702 {
michael@0 3703 d = *dst;
michael@0 3704 mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
michael@0 3705 mmx_dest = expand565_16_1x128 (d);
michael@0 3706
michael@0 3707 *dst = pack_565_32_16 (
michael@0 3708 pack_1x128_32 (
michael@0 3709 in_over_1x128 (
michael@0 3710 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
michael@0 3711 }
michael@0 3712
michael@0 3713 w--;
michael@0 3714 dst++;
michael@0 3715 }
michael@0 3716 }
michael@0 3717
michael@0 3718 }
michael@0 3719
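/*
 * OVER with a non-premultiplied, reversed-component ("pixbuf") source
 * into an r5g6b5 destination.  Blocks that are fully opaque only need
 * their color channels reordered (invert_colors_2x128); fully
 * transparent blocks are skipped; everything else goes through
 * over_rev_non_pre_2x128.
 */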
michael@0 3720 static void
michael@0 3721 sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
michael@0 3722 pixman_composite_info_t *info)
michael@0 3723 {
michael@0 3724 PIXMAN_COMPOSITE_ARGS (info);
michael@0 3725 uint16_t *dst_line, *dst, d;
michael@0 3726 uint32_t *src_line, *src, s;
michael@0 3727 int dst_stride, src_stride;
michael@0 3728 int32_t w;
michael@0 3729 uint32_t opaque, zero;
michael@0 3730
michael@0 3731 __m128i ms;
michael@0 3732 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
michael@0 3733 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
michael@0 3734
michael@0 3735 PIXMAN_IMAGE_GET_LINE (
michael@0 3736 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
michael@0 3737 PIXMAN_IMAGE_GET_LINE (
michael@0 3738 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
michael@0 3739
michael@0 3740 while (height--)
michael@0 3741 {
michael@0 3742 dst = dst_line;
michael@0 3743 dst_line += dst_stride;
michael@0 3744 src = src_line;
michael@0 3745 src_line += src_stride;
michael@0 3746 w = width;
michael@0 3747
michael@0 3748 while (w && (uintptr_t)dst & 15)
michael@0 3749 {
michael@0 3750 s = *src++;
michael@0 3751 d = *dst;
michael@0 3752
michael@0 3753 ms = unpack_32_1x128 (s);
michael@0 3754
michael@0 3755 *dst++ = pack_565_32_16 (
michael@0 3756 pack_1x128_32 (
michael@0 3757 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
michael@0 3758 w--;
michael@0 3759 }
michael@0 3760
michael@0 3761 while (w >= 8)
michael@0 3762 {
michael@0 3763 /* First round */
michael@0 3764 xmm_src = load_128_unaligned ((__m128i*)src);
michael@0 3765 xmm_dst = load_128_aligned ((__m128i*)dst);
michael@0 3766
michael@0 3767 opaque = is_opaque (xmm_src);
michael@0 3768 zero = is_zero (xmm_src);
michael@0 3769
michael@0 3770 unpack_565_128_4x128 (xmm_dst,
michael@0 3771 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
michael@0 3772 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
michael@0 3773
michael@0 3774 /* preload next round*/
michael@0 3775 xmm_src = load_128_unaligned ((__m128i*)(src + 4));
michael@0 3776
michael@0 3777 if (opaque)
michael@0 3778 {
michael@0 3779 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
michael@0 3780 &xmm_dst0, &xmm_dst1);
michael@0 3781 }
michael@0 3782 else if (!zero)
michael@0 3783 {
michael@0 3784 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
michael@0 3785 &xmm_dst0, &xmm_dst1);
michael@0 3786 }
michael@0 3787
michael@0 3788 /* Second round */
michael@0 3789 opaque = is_opaque (xmm_src);
michael@0 3790 zero = is_zero (xmm_src);
michael@0 3791
michael@0 3792 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
michael@0 3793
michael@0 3794 if (opaque)
michael@0 3795 {
michael@0 3796 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
michael@0 3797 &xmm_dst2, &xmm_dst3);
michael@0 3798 }
michael@0 3799 else if (!zero)
michael@0 3800 {
michael@0 3801 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
michael@0 3802 &xmm_dst2, &xmm_dst3);
michael@0 3803 }
michael@0 3804
michael@0 3805 save_128_aligned (
michael@0 3806 (__m128i*)dst, pack_565_4x128_128 (
michael@0 3807 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
michael@0 3808
michael@0 3809 w -= 8;
michael@0 3810 src += 8;
michael@0 3811 dst += 8;
michael@0 3812 }
michael@0 3813
michael@0 3814 while (w)
michael@0 3815 {
michael@0 3816 s = *src++;
michael@0 3817 d = *dst;
michael@0 3818
michael@0 3819 ms = unpack_32_1x128 (s);
michael@0 3820
michael@0 3821 *dst++ = pack_565_32_16 (
michael@0 3822 pack_1x128_32 (
michael@0 3823 over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
michael@0 3824 w--;
michael@0 3825 }
michael@0 3826 }
michael@0 3827
michael@0 3828 }
michael@0 3829
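/*
 * Same as sse2_composite_over_pixbuf_0565, but for a 32 bpp
 * destination: opaque source blocks are stored after a color-channel
 * reorder, transparent blocks are skipped, and the rest are blended
 * with over_rev_non_pre_2x128.
 */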
michael@0 3830 static void
michael@0 3831 sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
michael@0 3832 pixman_composite_info_t *info)
michael@0 3833 {
michael@0 3834 PIXMAN_COMPOSITE_ARGS (info);
michael@0 3835 uint32_t *dst_line, *dst, d;
michael@0 3836 uint32_t *src_line, *src, s;
michael@0 3837 int dst_stride, src_stride;
michael@0 3838 int32_t w;
michael@0 3839 uint32_t opaque, zero;
michael@0 3840
michael@0 3841 __m128i xmm_src_lo, xmm_src_hi;
michael@0 3842 __m128i xmm_dst_lo, xmm_dst_hi;
michael@0 3843
michael@0 3844 PIXMAN_IMAGE_GET_LINE (
michael@0 3845 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
michael@0 3846 PIXMAN_IMAGE_GET_LINE (
michael@0 3847 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
michael@0 3848
michael@0 3849 while (height--)
michael@0 3850 {
michael@0 3851 dst = dst_line;
michael@0 3852 dst_line += dst_stride;
michael@0 3853 src = src_line;
michael@0 3854 src_line += src_stride;
michael@0 3855 w = width;
michael@0 3856
michael@0 3857 while (w && (uintptr_t)dst & 15)
michael@0 3858 {
michael@0 3859 s = *src++;
michael@0 3860 d = *dst;
michael@0 3861
michael@0 3862 *dst++ = pack_1x128_32 (
michael@0 3863 over_rev_non_pre_1x128 (
michael@0 3864 unpack_32_1x128 (s), unpack_32_1x128 (d)));
michael@0 3865
michael@0 3866 w--;
michael@0 3867 }
michael@0 3868
michael@0 3869 while (w >= 4)
michael@0 3870 {
michael@0 3871 xmm_src_hi = load_128_unaligned ((__m128i*)src);
michael@0 3872
michael@0 3873 opaque = is_opaque (xmm_src_hi);
michael@0 3874 zero = is_zero (xmm_src_hi);
michael@0 3875
michael@0 3876 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
michael@0 3877
michael@0 3878 if (opaque)
michael@0 3879 {
michael@0 3880 invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
michael@0 3881 &xmm_dst_lo, &xmm_dst_hi);
michael@0 3882
michael@0 3883 save_128_aligned (
michael@0 3884 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 3885 }
michael@0 3886 else if (!zero)
michael@0 3887 {
michael@0 3888 xmm_dst_hi = load_128_aligned ((__m128i*)dst);
michael@0 3889
michael@0 3890 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
michael@0 3891
michael@0 3892 over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
michael@0 3893 &xmm_dst_lo, &xmm_dst_hi);
michael@0 3894
michael@0 3895 save_128_aligned (
michael@0 3896 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 3897 }
michael@0 3898
michael@0 3899 w -= 4;
michael@0 3900 dst += 4;
michael@0 3901 src += 4;
michael@0 3902 }
michael@0 3903
michael@0 3904 while (w)
michael@0 3905 {
michael@0 3906 s = *src++;
michael@0 3907 d = *dst;
michael@0 3908
michael@0 3909 *dst++ = pack_1x128_32 (
michael@0 3910 over_rev_non_pre_1x128 (
michael@0 3911 unpack_32_1x128 (s), unpack_32_1x128 (d)));
michael@0 3912
michael@0 3913 w--;
michael@0 3914 }
michael@0 3915 }
michael@0 3916
michael@0 3917 }
michael@0 3918
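/*
 * OVER, solid source with a component-alpha a8r8g8b8 mask, into an
 * r5g6b5 destination.  pack_cmp is a byte movemask over
 * "mask pixel == 0"; a value of 0xffff means all four mask pixels in
 * that half are zero, so the blend for that half can be skipped.
 */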
michael@0 3919 static void
michael@0 3920 sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
michael@0 3921 pixman_composite_info_t *info)
michael@0 3922 {
michael@0 3923 PIXMAN_COMPOSITE_ARGS (info);
michael@0 3924 uint32_t src;
michael@0 3925 uint16_t *dst_line, *dst, d;
michael@0 3926 uint32_t *mask_line, *mask, m;
michael@0 3927 int dst_stride, mask_stride;
michael@0 3928 int32_t w;
michael@0 3929 uint32_t pack_cmp;
michael@0 3930
michael@0 3931 __m128i xmm_src, xmm_alpha;
michael@0 3932 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
michael@0 3933 __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
michael@0 3934
michael@0 3935 __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
michael@0 3936
michael@0 3937 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
michael@0 3938
michael@0 3939 if (src == 0)
michael@0 3940 return;
michael@0 3941
michael@0 3942 PIXMAN_IMAGE_GET_LINE (
michael@0 3943 dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
michael@0 3944 PIXMAN_IMAGE_GET_LINE (
michael@0 3945 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
michael@0 3946
michael@0 3947 xmm_src = expand_pixel_32_1x128 (src);
michael@0 3948 xmm_alpha = expand_alpha_1x128 (xmm_src);
michael@0 3949 mmx_src = xmm_src;
michael@0 3950 mmx_alpha = xmm_alpha;
michael@0 3951
michael@0 3952 while (height--)
michael@0 3953 {
michael@0 3954 w = width;
michael@0 3955 mask = mask_line;
michael@0 3956 dst = dst_line;
michael@0 3957 mask_line += mask_stride;
michael@0 3958 dst_line += dst_stride;
michael@0 3959
michael@0 3960 while (w && ((uintptr_t)dst & 15))
michael@0 3961 {
michael@0 3962 m = *(uint32_t *) mask;
michael@0 3963
michael@0 3964 if (m)
michael@0 3965 {
michael@0 3966 d = *dst;
michael@0 3967 mmx_mask = unpack_32_1x128 (m);
michael@0 3968 mmx_dest = expand565_16_1x128 (d);
michael@0 3969
michael@0 3970 *dst = pack_565_32_16 (
michael@0 3971 pack_1x128_32 (
michael@0 3972 in_over_1x128 (
michael@0 3973 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
michael@0 3974 }
michael@0 3975
michael@0 3976 w--;
michael@0 3977 dst++;
michael@0 3978 mask++;
michael@0 3979 }
michael@0 3980
michael@0 3981 while (w >= 8)
michael@0 3982 {
michael@0 3983 /* First round */
michael@0 3984 xmm_mask = load_128_unaligned ((__m128i*)mask);
michael@0 3985 xmm_dst = load_128_aligned ((__m128i*)dst);
michael@0 3986
michael@0 3987 pack_cmp = _mm_movemask_epi8 (
michael@0 3988 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
michael@0 3989
michael@0 3990 unpack_565_128_4x128 (xmm_dst,
michael@0 3991 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
michael@0 3992 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
michael@0 3993
michael@0 3994 /* preload next round */
michael@0 3995 xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
michael@0 3996
michael@0 3998 if (pack_cmp != 0xffff)
michael@0 3999 {
michael@0 4000 in_over_2x128 (&xmm_src, &xmm_src,
michael@0 4001 &xmm_alpha, &xmm_alpha,
michael@0 4002 &xmm_mask_lo, &xmm_mask_hi,
michael@0 4003 &xmm_dst0, &xmm_dst1);
michael@0 4004 }
michael@0 4005
michael@0 4006 /* Second round */
michael@0 4007 pack_cmp = _mm_movemask_epi8 (
michael@0 4008 _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
michael@0 4009
michael@0 4010 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
michael@0 4011
michael@0 4012 if (pack_cmp != 0xffff)
michael@0 4013 {
michael@0 4014 in_over_2x128 (&xmm_src, &xmm_src,
michael@0 4015 &xmm_alpha, &xmm_alpha,
michael@0 4016 &xmm_mask_lo, &xmm_mask_hi,
michael@0 4017 &xmm_dst2, &xmm_dst3);
michael@0 4018 }
michael@0 4019
michael@0 4020 save_128_aligned (
michael@0 4021 (__m128i*)dst, pack_565_4x128_128 (
michael@0 4022 &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
michael@0 4023
michael@0 4024 w -= 8;
michael@0 4025 dst += 8;
michael@0 4026 mask += 8;
michael@0 4027 }
michael@0 4028
michael@0 4029 while (w)
michael@0 4030 {
michael@0 4031 m = *(uint32_t *) mask;
michael@0 4032
michael@0 4033 if (m)
michael@0 4034 {
michael@0 4035 d = *dst;
michael@0 4036 mmx_mask = unpack_32_1x128 (m);
michael@0 4037 mmx_dest = expand565_16_1x128 (d);
michael@0 4038
michael@0 4039 *dst = pack_565_32_16 (
michael@0 4040 pack_1x128_32 (
michael@0 4041 in_over_1x128 (
michael@0 4042 &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
michael@0 4043 }
michael@0 4044
michael@0 4045 w--;
michael@0 4046 dst++;
michael@0 4047 mask++;
michael@0 4048 }
michael@0 4049 }
michael@0 4050
michael@0 4051 }
michael@0 4052
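/*
 * IN operator, solid source alpha with an a8 mask, into an a8
 * destination: dest = srca * mask * dest.
 */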
michael@0 4053 static void
michael@0 4054 sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
michael@0 4055 pixman_composite_info_t *info)
michael@0 4056 {
michael@0 4057 PIXMAN_COMPOSITE_ARGS (info);
michael@0 4058 uint8_t *dst_line, *dst;
michael@0 4059 uint8_t *mask_line, *mask;
michael@0 4060 int dst_stride, mask_stride;
michael@0 4061 uint32_t d, m;
michael@0 4062 uint32_t src;
michael@0 4063 int32_t w;
michael@0 4064
michael@0 4065 __m128i xmm_alpha;
michael@0 4066 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
michael@0 4067 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
michael@0 4068
michael@0 4069 PIXMAN_IMAGE_GET_LINE (
michael@0 4070 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
michael@0 4071 PIXMAN_IMAGE_GET_LINE (
michael@0 4072 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
michael@0 4073
michael@0 4074 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
michael@0 4075
michael@0 4076 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
michael@0 4077
michael@0 4078 while (height--)
michael@0 4079 {
michael@0 4080 dst = dst_line;
michael@0 4081 dst_line += dst_stride;
michael@0 4082 mask = mask_line;
michael@0 4083 mask_line += mask_stride;
michael@0 4084 w = width;
michael@0 4085
michael@0 4086 while (w && ((uintptr_t)dst & 15))
michael@0 4087 {
michael@0 4088 m = (uint32_t) *mask++;
michael@0 4089 d = (uint32_t) *dst;
michael@0 4090
michael@0 4091 *dst++ = (uint8_t) pack_1x128_32 (
michael@0 4092 pix_multiply_1x128 (
michael@0 4093 pix_multiply_1x128 (xmm_alpha,
michael@0 4094 unpack_32_1x128 (m)),
michael@0 4095 unpack_32_1x128 (d)));
michael@0 4096 w--;
michael@0 4097 }
michael@0 4098
michael@0 4099 while (w >= 16)
michael@0 4100 {
michael@0 4101 xmm_mask = load_128_unaligned ((__m128i*)mask);
michael@0 4102 xmm_dst = load_128_aligned ((__m128i*)dst);
michael@0 4103
michael@0 4104 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
michael@0 4105 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
michael@0 4106
michael@0 4107 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
michael@0 4108 &xmm_mask_lo, &xmm_mask_hi,
michael@0 4109 &xmm_mask_lo, &xmm_mask_hi);
michael@0 4110
michael@0 4111 pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
michael@0 4112 &xmm_dst_lo, &xmm_dst_hi,
michael@0 4113 &xmm_dst_lo, &xmm_dst_hi);
michael@0 4114
michael@0 4115 save_128_aligned (
michael@0 4116 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 4117
michael@0 4118 mask += 16;
michael@0 4119 dst += 16;
michael@0 4120 w -= 16;
michael@0 4121 }
michael@0 4122
michael@0 4123 while (w)
michael@0 4124 {
michael@0 4125 m = (uint32_t) *mask++;
michael@0 4126 d = (uint32_t) *dst;
michael@0 4127
michael@0 4128 *dst++ = (uint8_t) pack_1x128_32 (
michael@0 4129 pix_multiply_1x128 (
michael@0 4130 pix_multiply_1x128 (
michael@0 4131 xmm_alpha, unpack_32_1x128 (m)),
michael@0 4132 unpack_32_1x128 (d)));
michael@0 4133 w--;
michael@0 4134 }
michael@0 4135 }
michael@0 4136
michael@0 4137 }
michael@0 4138
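/*
 * IN operator, solid source into an a8 destination: dest = srca * dest.
 * srca == 0xff leaves the destination untouched, and srca == 0x00
 * reduces to a fill with zero.
 */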
michael@0 4139 static void
michael@0 4140 sse2_composite_in_n_8 (pixman_implementation_t *imp,
michael@0 4141 pixman_composite_info_t *info)
michael@0 4142 {
michael@0 4143 PIXMAN_COMPOSITE_ARGS (info);
michael@0 4144 uint8_t *dst_line, *dst;
michael@0 4145 int dst_stride;
michael@0 4146 uint32_t d;
michael@0 4147 uint32_t src;
michael@0 4148 int32_t w;
michael@0 4149
michael@0 4150 __m128i xmm_alpha;
michael@0 4151 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
michael@0 4152
michael@0 4153 PIXMAN_IMAGE_GET_LINE (
michael@0 4154 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
michael@0 4155
michael@0 4156 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
michael@0 4157
michael@0 4158 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
michael@0 4159
michael@0 4160 src = src >> 24;
michael@0 4161
michael@0 4162 if (src == 0xff)
michael@0 4163 return;
michael@0 4164
michael@0 4165 if (src == 0x00)
michael@0 4166 {
michael@0 4167 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
michael@0 4168 8, dest_x, dest_y, width, height, src);
michael@0 4169
michael@0 4170 return;
michael@0 4171 }
michael@0 4172
michael@0 4173 while (height--)
michael@0 4174 {
michael@0 4175 dst = dst_line;
michael@0 4176 dst_line += dst_stride;
michael@0 4177 w = width;
michael@0 4178
michael@0 4179 while (w && ((uintptr_t)dst & 15))
michael@0 4180 {
michael@0 4181 d = (uint32_t) *dst;
michael@0 4182
michael@0 4183 *dst++ = (uint8_t) pack_1x128_32 (
michael@0 4184 pix_multiply_1x128 (
michael@0 4185 xmm_alpha,
michael@0 4186 unpack_32_1x128 (d)));
michael@0 4187 w--;
michael@0 4188 }
michael@0 4189
michael@0 4190 while (w >= 16)
michael@0 4191 {
michael@0 4192 xmm_dst = load_128_aligned ((__m128i*)dst);
michael@0 4193
michael@0 4194 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
michael@0 4195
michael@0 4196 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
michael@0 4197 &xmm_dst_lo, &xmm_dst_hi,
michael@0 4198 &xmm_dst_lo, &xmm_dst_hi);
michael@0 4199
michael@0 4200 save_128_aligned (
michael@0 4201 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 4202
michael@0 4203 dst += 16;
michael@0 4204 w -= 16;
michael@0 4205 }
michael@0 4206
michael@0 4207 while (w)
michael@0 4208 {
michael@0 4209 d = (uint32_t) *dst;
michael@0 4210
michael@0 4211 *dst++ = (uint8_t) pack_1x128_32 (
michael@0 4212 pix_multiply_1x128 (
michael@0 4213 xmm_alpha,
michael@0 4214 unpack_32_1x128 (d)));
michael@0 4215 w--;
michael@0 4216 }
michael@0 4217 }
michael@0 4218
michael@0 4219 }
michael@0 4220
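/* IN operator, a8 source into an a8 destination: dest = src * dest. */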
michael@0 4221 static void
michael@0 4222 sse2_composite_in_8_8 (pixman_implementation_t *imp,
michael@0 4223 pixman_composite_info_t *info)
michael@0 4224 {
michael@0 4225 PIXMAN_COMPOSITE_ARGS (info);
michael@0 4226 uint8_t *dst_line, *dst;
michael@0 4227 uint8_t *src_line, *src;
michael@0 4228 int src_stride, dst_stride;
michael@0 4229 int32_t w;
michael@0 4230 uint32_t s, d;
michael@0 4231
michael@0 4232 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
michael@0 4233 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
michael@0 4234
michael@0 4235 PIXMAN_IMAGE_GET_LINE (
michael@0 4236 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
michael@0 4237 PIXMAN_IMAGE_GET_LINE (
michael@0 4238 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
michael@0 4239
michael@0 4240 while (height--)
michael@0 4241 {
michael@0 4242 dst = dst_line;
michael@0 4243 dst_line += dst_stride;
michael@0 4244 src = src_line;
michael@0 4245 src_line += src_stride;
michael@0 4246 w = width;
michael@0 4247
michael@0 4248 while (w && ((uintptr_t)dst & 15))
michael@0 4249 {
michael@0 4250 s = (uint32_t) *src++;
michael@0 4251 d = (uint32_t) *dst;
michael@0 4252
michael@0 4253 *dst++ = (uint8_t) pack_1x128_32 (
michael@0 4254 pix_multiply_1x128 (
michael@0 4255 unpack_32_1x128 (s), unpack_32_1x128 (d)));
michael@0 4256 w--;
michael@0 4257 }
michael@0 4258
michael@0 4259 while (w >= 16)
michael@0 4260 {
michael@0 4261 xmm_src = load_128_unaligned ((__m128i*)src);
michael@0 4262 xmm_dst = load_128_aligned ((__m128i*)dst);
michael@0 4263
michael@0 4264 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
michael@0 4265 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
michael@0 4266
michael@0 4267 pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
michael@0 4268 &xmm_dst_lo, &xmm_dst_hi,
michael@0 4269 &xmm_dst_lo, &xmm_dst_hi);
michael@0 4270
michael@0 4271 save_128_aligned (
michael@0 4272 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 4273
michael@0 4274 src += 16;
michael@0 4275 dst += 16;
michael@0 4276 w -= 16;
michael@0 4277 }
michael@0 4278
michael@0 4279 while (w)
michael@0 4280 {
michael@0 4281 s = (uint32_t) *src++;
michael@0 4282 d = (uint32_t) *dst;
michael@0 4283
michael@0 4284 *dst++ = (uint8_t) pack_1x128_32 (
michael@0 4285 pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
michael@0 4286 w--;
michael@0 4287 }
michael@0 4288 }
michael@0 4289
michael@0 4290 }
michael@0 4291
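/*
 * ADD operator, solid source alpha with an a8 mask, into an a8
 * destination: dest = clamp (srca * mask + dest), using saturating
 * 16-bit adds.
 */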
michael@0 4292 static void
michael@0 4293 sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
michael@0 4294 pixman_composite_info_t *info)
michael@0 4295 {
michael@0 4296 PIXMAN_COMPOSITE_ARGS (info);
michael@0 4297 uint8_t *dst_line, *dst;
michael@0 4298 uint8_t *mask_line, *mask;
michael@0 4299 int dst_stride, mask_stride;
michael@0 4300 int32_t w;
michael@0 4301 uint32_t src;
michael@0 4302 uint32_t m, d;
michael@0 4303
michael@0 4304 __m128i xmm_alpha;
michael@0 4305 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
michael@0 4306 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
michael@0 4307
michael@0 4308 PIXMAN_IMAGE_GET_LINE (
michael@0 4309 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
michael@0 4310 PIXMAN_IMAGE_GET_LINE (
michael@0 4311 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
michael@0 4312
michael@0 4313 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
michael@0 4314
michael@0 4315 xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
michael@0 4316
michael@0 4317 while (height--)
michael@0 4318 {
michael@0 4319 dst = dst_line;
michael@0 4320 dst_line += dst_stride;
michael@0 4321 mask = mask_line;
michael@0 4322 mask_line += mask_stride;
michael@0 4323 w = width;
michael@0 4324
michael@0 4325 while (w && ((uintptr_t)dst & 15))
michael@0 4326 {
michael@0 4327 m = (uint32_t) *mask++;
michael@0 4328 d = (uint32_t) *dst;
michael@0 4329
michael@0 4330 *dst++ = (uint8_t) pack_1x128_32 (
michael@0 4331 _mm_adds_epu16 (
michael@0 4332 pix_multiply_1x128 (
michael@0 4333 xmm_alpha, unpack_32_1x128 (m)),
michael@0 4334 unpack_32_1x128 (d)));
michael@0 4335 w--;
michael@0 4336 }
michael@0 4337
michael@0 4338 while (w >= 16)
michael@0 4339 {
michael@0 4340 xmm_mask = load_128_unaligned ((__m128i*)mask);
michael@0 4341 xmm_dst = load_128_aligned ((__m128i*)dst);
michael@0 4342
michael@0 4343 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
michael@0 4344 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
michael@0 4345
michael@0 4346 pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
michael@0 4347 &xmm_mask_lo, &xmm_mask_hi,
michael@0 4348 &xmm_mask_lo, &xmm_mask_hi);
michael@0 4349
michael@0 4350 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
michael@0 4351 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
michael@0 4352
michael@0 4353 save_128_aligned (
michael@0 4354 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 4355
michael@0 4356 mask += 16;
michael@0 4357 dst += 16;
michael@0 4358 w -= 16;
michael@0 4359 }
michael@0 4360
michael@0 4361 while (w)
michael@0 4362 {
michael@0 4363 m = (uint32_t) *mask++;
michael@0 4364 d = (uint32_t) *dst;
michael@0 4365
michael@0 4366 *dst++ = (uint8_t) pack_1x128_32 (
michael@0 4367 _mm_adds_epu16 (
michael@0 4368 pix_multiply_1x128 (
michael@0 4369 xmm_alpha, unpack_32_1x128 (m)),
michael@0 4370 unpack_32_1x128 (d)));
michael@0 4371
michael@0 4372 w--;
michael@0 4373 }
michael@0 4374 }
michael@0 4375
michael@0 4376 }
michael@0 4377
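/*
 * ADD operator, solid source into an a8 destination:
 * dest = clamp (srca + dest).  The source alpha byte is replicated to
 * all sixteen lanes and added with saturating byte adds; 0x00 is a
 * no-op and 0xff reduces to a fill with 0xff.
 */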
michael@0 4378 static void
michael@0 4379 sse2_composite_add_n_8 (pixman_implementation_t *imp,
michael@0 4380 pixman_composite_info_t *info)
michael@0 4381 {
michael@0 4382 PIXMAN_COMPOSITE_ARGS (info);
michael@0 4383 uint8_t *dst_line, *dst;
michael@0 4384 int dst_stride;
michael@0 4385 int32_t w;
michael@0 4386 uint32_t src;
michael@0 4387
michael@0 4388 __m128i xmm_src;
michael@0 4389
michael@0 4390 PIXMAN_IMAGE_GET_LINE (
michael@0 4391 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
michael@0 4392
michael@0 4393 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
michael@0 4394
michael@0 4395 src >>= 24;
michael@0 4396
michael@0 4397 if (src == 0x00)
michael@0 4398 return;
michael@0 4399
michael@0 4400 if (src == 0xff)
michael@0 4401 {
michael@0 4402 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
michael@0 4403 8, dest_x, dest_y, width, height, 0xff);
michael@0 4404
michael@0 4405 return;
michael@0 4406 }
michael@0 4407
michael@0 4408 src = (src << 24) | (src << 16) | (src << 8) | src;
michael@0 4409 xmm_src = _mm_set_epi32 (src, src, src, src);
michael@0 4410
michael@0 4411 while (height--)
michael@0 4412 {
michael@0 4413 dst = dst_line;
michael@0 4414 dst_line += dst_stride;
michael@0 4415 w = width;
michael@0 4416
michael@0 4417 while (w && ((uintptr_t)dst & 15))
michael@0 4418 {
michael@0 4419 *dst = (uint8_t)_mm_cvtsi128_si32 (
michael@0 4420 _mm_adds_epu8 (
michael@0 4421 xmm_src,
michael@0 4422 _mm_cvtsi32_si128 (*dst)));
michael@0 4423
michael@0 4424 w--;
michael@0 4425 dst++;
michael@0 4426 }
michael@0 4427
michael@0 4428 while (w >= 16)
michael@0 4429 {
michael@0 4430 save_128_aligned (
michael@0 4431 (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
michael@0 4432
michael@0 4433 dst += 16;
michael@0 4434 w -= 16;
michael@0 4435 }
michael@0 4436
michael@0 4437 while (w)
michael@0 4438 {
michael@0 4439 *dst = (uint8_t)_mm_cvtsi128_si32 (
michael@0 4440 _mm_adds_epu8 (
michael@0 4441 xmm_src,
michael@0 4442 _mm_cvtsi32_si128 (*dst)));
michael@0 4443
michael@0 4444 w--;
michael@0 4445 dst++;
michael@0 4446 }
michael@0 4447 }
michael@0 4448
michael@0 4449 }
michael@0 4450
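/*
 * ADD operator, a8 source into an a8 destination.  The scalar head and
 * tail handle the unaligned edges; the 4-byte-aligned middle is passed
 * to sse2_combine_add_u, which treats each group of four a8 pixels as
 * one 32-bit unit.
 */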
michael@0 4451 static void
michael@0 4452 sse2_composite_add_8_8 (pixman_implementation_t *imp,
michael@0 4453 pixman_composite_info_t *info)
michael@0 4454 {
michael@0 4455 PIXMAN_COMPOSITE_ARGS (info);
michael@0 4456 uint8_t *dst_line, *dst;
michael@0 4457 uint8_t *src_line, *src;
michael@0 4458 int dst_stride, src_stride;
michael@0 4459 int32_t w;
michael@0 4460 uint16_t t;
michael@0 4461
michael@0 4462 PIXMAN_IMAGE_GET_LINE (
michael@0 4463 src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
michael@0 4464 PIXMAN_IMAGE_GET_LINE (
michael@0 4465 dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
michael@0 4466
michael@0 4467 while (height--)
michael@0 4468 {
michael@0 4469 dst = dst_line;
michael@0 4470 src = src_line;
michael@0 4471
michael@0 4472 dst_line += dst_stride;
michael@0 4473 src_line += src_stride;
michael@0 4474 w = width;
michael@0 4475
michael@0 4476 /* Small head */
michael@0 4477 while (w && (uintptr_t)dst & 3)
michael@0 4478 {
michael@0 4479 t = (*dst) + (*src++);
michael@0 4480 *dst++ = t | (0 - (t >> 8));
michael@0 4481 w--;
michael@0 4482 }
michael@0 4483
michael@0 4484 sse2_combine_add_u (imp, op,
michael@0 4485 (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
michael@0 4486
michael@0 4487 /* Small tail */
michael@0 4488 dst += w & ~3;
michael@0 4489 src += w & ~3;
michael@0 4490
michael@0 4491 w &= 3;
michael@0 4492
michael@0 4493 while (w)
michael@0 4494 {
michael@0 4495 t = (*dst) + (*src++);
michael@0 4496 *dst++ = t | (0 - (t >> 8));
michael@0 4497 w--;
michael@0 4498 }
michael@0 4499 }
michael@0 4500
michael@0 4501 }
michael@0 4502
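/*
 * ADD operator, 32 bpp source into a 32 bpp destination; each scanline
 * is handed to sse2_combine_add_u.
 */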
michael@0 4503 static void
michael@0 4504 sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
michael@0 4505 pixman_composite_info_t *info)
michael@0 4506 {
michael@0 4507 PIXMAN_COMPOSITE_ARGS (info);
michael@0 4508 uint32_t *dst_line, *dst;
michael@0 4509 uint32_t *src_line, *src;
michael@0 4510 int dst_stride, src_stride;
michael@0 4511
michael@0 4512 PIXMAN_IMAGE_GET_LINE (
michael@0 4513 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
michael@0 4514 PIXMAN_IMAGE_GET_LINE (
michael@0 4515 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
michael@0 4516
michael@0 4517 while (height--)
michael@0 4518 {
michael@0 4519 dst = dst_line;
michael@0 4520 dst_line += dst_stride;
michael@0 4521 src = src_line;
michael@0 4522 src_line += src_stride;
michael@0 4523
michael@0 4524 sse2_combine_add_u (imp, op, dst, src, NULL, width);
michael@0 4525 }
michael@0 4526 }
michael@0 4527
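/*
 * ADD operator, solid source into a 32 bpp destination:
 * dest = clamp (src + dest), using saturating byte adds.  A zero
 * source is a no-op; an all-ones source reduces to a fill with ~0.
 */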
michael@0 4528 static void
michael@0 4529 sse2_composite_add_n_8888 (pixman_implementation_t *imp,
michael@0 4530 pixman_composite_info_t *info)
michael@0 4531 {
michael@0 4532 PIXMAN_COMPOSITE_ARGS (info);
michael@0 4533 uint32_t *dst_line, *dst, src;
michael@0 4534 int dst_stride;
michael@0 4535
michael@0 4536 __m128i xmm_src;
michael@0 4537
michael@0 4538 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
michael@0 4539
michael@0 4540 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
michael@0 4541 if (src == 0)
michael@0 4542 return;
michael@0 4543
michael@0 4544 if (src == ~0)
michael@0 4545 {
michael@0 4546 pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
michael@0 4547 dest_x, dest_y, width, height, ~0);
michael@0 4548
michael@0 4549 return;
michael@0 4550 }
michael@0 4551
michael@0 4552 xmm_src = _mm_set_epi32 (src, src, src, src);
michael@0 4553 while (height--)
michael@0 4554 {
michael@0 4555 int w = width;
michael@0 4556 uint32_t d;
michael@0 4557
michael@0 4558 dst = dst_line;
michael@0 4559 dst_line += dst_stride;
michael@0 4560
michael@0 4561 while (w && (uintptr_t)dst & 15)
michael@0 4562 {
michael@0 4563 d = *dst;
michael@0 4564 *dst++ =
michael@0 4565 _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
michael@0 4566 w--;
michael@0 4567 }
michael@0 4568
michael@0 4569 while (w >= 4)
michael@0 4570 {
michael@0 4571 save_128_aligned
michael@0 4572 ((__m128i*)dst,
michael@0 4573 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
michael@0 4574
michael@0 4575 dst += 4;
michael@0 4576 w -= 4;
michael@0 4577 }
michael@0 4578
michael@0 4579 while (w--)
michael@0 4580 {
michael@0 4581 d = *dst;
michael@0 4582 *dst++ =
michael@0 4583 _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
michael@0 4584 _mm_cvtsi32_si128 (d)));
michael@0 4585 }
michael@0 4586 }
michael@0 4587 }
michael@0 4588
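/*
 * ADD operator, solid source with an a8 mask, into a 32 bpp
 * destination: dest = clamp (src * mask + dest).
 */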
michael@0 4589 static void
michael@0 4590 sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
michael@0 4591 pixman_composite_info_t *info)
michael@0 4592 {
michael@0 4593 PIXMAN_COMPOSITE_ARGS (info);
michael@0 4594 uint32_t *dst_line, *dst;
michael@0 4595 uint8_t *mask_line, *mask;
michael@0 4596 int dst_stride, mask_stride;
michael@0 4597 int32_t w;
michael@0 4598 uint32_t src;
michael@0 4599
michael@0 4600 __m128i xmm_src;
michael@0 4601
michael@0 4602 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
michael@0 4603 if (src == 0)
michael@0 4604 return;
michael@0 4605 xmm_src = expand_pixel_32_1x128 (src);
michael@0 4606
michael@0 4607 PIXMAN_IMAGE_GET_LINE (
michael@0 4608 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
michael@0 4609 PIXMAN_IMAGE_GET_LINE (
michael@0 4610 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
michael@0 4611
michael@0 4612 while (height--)
michael@0 4613 {
michael@0 4614 dst = dst_line;
michael@0 4615 dst_line += dst_stride;
michael@0 4616 mask = mask_line;
michael@0 4617 mask_line += mask_stride;
michael@0 4618 w = width;
michael@0 4619
michael@0 4620 while (w && ((uintptr_t)dst & 15))
michael@0 4621 {
michael@0 4622 uint8_t m = *mask++;
michael@0 4623 if (m)
michael@0 4624 {
michael@0 4625 *dst = pack_1x128_32
michael@0 4626 (_mm_adds_epu16
michael@0 4627 (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
michael@0 4628 unpack_32_1x128 (*dst)));
michael@0 4629 }
michael@0 4630 dst++;
michael@0 4631 w--;
michael@0 4632 }
michael@0 4633
michael@0 4634 while (w >= 4)
michael@0 4635 {
michael@0 4636 uint32_t m = *(uint32_t*)mask;
michael@0 4637 if (m)
michael@0 4638 {
michael@0 4639 __m128i xmm_mask_lo, xmm_mask_hi;
michael@0 4640 __m128i xmm_dst_lo, xmm_dst_hi;
michael@0 4641
michael@0 4642 __m128i xmm_dst = load_128_aligned ((__m128i*)dst);
michael@0 4643 __m128i xmm_mask =
michael@0 4644 _mm_unpacklo_epi8 (unpack_32_1x128(m),
michael@0 4645 _mm_setzero_si128 ());
michael@0 4646
michael@0 4647 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
michael@0 4648 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
michael@0 4649
michael@0 4650 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
michael@0 4651 &xmm_mask_lo, &xmm_mask_hi);
michael@0 4652
michael@0 4653 pix_multiply_2x128 (&xmm_src, &xmm_src,
michael@0 4654 &xmm_mask_lo, &xmm_mask_hi,
michael@0 4655 &xmm_mask_lo, &xmm_mask_hi);
michael@0 4656
michael@0 4657 xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
michael@0 4658 xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
michael@0 4659
michael@0 4660 save_128_aligned (
michael@0 4661 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 4662 }
michael@0 4663
michael@0 4664 w -= 4;
michael@0 4665 dst += 4;
michael@0 4666 mask += 4;
michael@0 4667 }
michael@0 4668
michael@0 4669 while (w)
michael@0 4670 {
michael@0 4671 uint8_t m = *mask++;
michael@0 4672 if (m)
michael@0 4673 {
michael@0 4674 *dst = pack_1x128_32
michael@0 4675 (_mm_adds_epu16
michael@0 4676 (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
michael@0 4677 unpack_32_1x128 (*dst)));
michael@0 4678 }
michael@0 4679 dst++;
michael@0 4680 w--;
michael@0 4681 }
michael@0 4682 }
michael@0 4683 }
michael@0 4684
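/*
 * sse2_blt: 16 or 32 bpp rectangle copy.  Source loads are unaligned;
 * the destination is aligned up to 16 bytes with scalar stores and
 * then copied 64 bytes per unrolled iteration.  Returns FALSE when the
 * source and destination depths differ or are unsupported.
 */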
michael@0 4685 static pixman_bool_t
michael@0 4686 sse2_blt (pixman_implementation_t *imp,
michael@0 4687 uint32_t * src_bits,
michael@0 4688 uint32_t * dst_bits,
michael@0 4689 int src_stride,
michael@0 4690 int dst_stride,
michael@0 4691 int src_bpp,
michael@0 4692 int dst_bpp,
michael@0 4693 int src_x,
michael@0 4694 int src_y,
michael@0 4695 int dest_x,
michael@0 4696 int dest_y,
michael@0 4697 int width,
michael@0 4698 int height)
michael@0 4699 {
michael@0 4700 uint8_t * src_bytes;
michael@0 4701 uint8_t * dst_bytes;
michael@0 4702 int byte_width;
michael@0 4703
michael@0 4704 if (src_bpp != dst_bpp)
michael@0 4705 return FALSE;
michael@0 4706
michael@0 4707 if (src_bpp == 16)
michael@0 4708 {
michael@0 4709 src_stride = src_stride * (int) sizeof (uint32_t) / 2;
michael@0 4710 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
michael@0 4711 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
michael@0 4712 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
michael@0 4713 byte_width = 2 * width;
michael@0 4714 src_stride *= 2;
michael@0 4715 dst_stride *= 2;
michael@0 4716 }
michael@0 4717 else if (src_bpp == 32)
michael@0 4718 {
michael@0 4719 src_stride = src_stride * (int) sizeof (uint32_t) / 4;
michael@0 4720 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
michael@0 4721 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
michael@0 4722 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
michael@0 4723 byte_width = 4 * width;
michael@0 4724 src_stride *= 4;
michael@0 4725 dst_stride *= 4;
michael@0 4726 }
michael@0 4727 else
michael@0 4728 {
michael@0 4729 return FALSE;
michael@0 4730 }
michael@0 4731
michael@0 4732 while (height--)
michael@0 4733 {
michael@0 4734 int w;
michael@0 4735 uint8_t *s = src_bytes;
michael@0 4736 uint8_t *d = dst_bytes;
michael@0 4737 src_bytes += src_stride;
michael@0 4738 dst_bytes += dst_stride;
michael@0 4739 w = byte_width;
michael@0 4740
michael@0 4741 while (w >= 2 && ((uintptr_t)d & 3))
michael@0 4742 {
michael@0 4743 *(uint16_t *)d = *(uint16_t *)s;
michael@0 4744 w -= 2;
michael@0 4745 s += 2;
michael@0 4746 d += 2;
michael@0 4747 }
michael@0 4748
michael@0 4749 while (w >= 4 && ((uintptr_t)d & 15))
michael@0 4750 {
michael@0 4751 *(uint32_t *)d = *(uint32_t *)s;
michael@0 4752
michael@0 4753 w -= 4;
michael@0 4754 s += 4;
michael@0 4755 d += 4;
michael@0 4756 }
michael@0 4757
michael@0 4758 while (w >= 64)
michael@0 4759 {
michael@0 4760 __m128i xmm0, xmm1, xmm2, xmm3;
michael@0 4761
michael@0 4762 xmm0 = load_128_unaligned ((__m128i*)(s));
michael@0 4763 xmm1 = load_128_unaligned ((__m128i*)(s + 16));
michael@0 4764 xmm2 = load_128_unaligned ((__m128i*)(s + 32));
michael@0 4765 xmm3 = load_128_unaligned ((__m128i*)(s + 48));
michael@0 4766
michael@0 4767 save_128_aligned ((__m128i*)(d), xmm0);
michael@0 4768 save_128_aligned ((__m128i*)(d + 16), xmm1);
michael@0 4769 save_128_aligned ((__m128i*)(d + 32), xmm2);
michael@0 4770 save_128_aligned ((__m128i*)(d + 48), xmm3);
michael@0 4771
michael@0 4772 s += 64;
michael@0 4773 d += 64;
michael@0 4774 w -= 64;
michael@0 4775 }
michael@0 4776
michael@0 4777 while (w >= 16)
michael@0 4778 {
michael@0 4779 save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s));
michael@0 4780
michael@0 4781 w -= 16;
michael@0 4782 d += 16;
michael@0 4783 s += 16;
michael@0 4784 }
michael@0 4785
michael@0 4786 while (w >= 4)
michael@0 4787 {
michael@0 4788 *(uint32_t *)d = *(uint32_t *)s;
michael@0 4789
michael@0 4790 w -= 4;
michael@0 4791 s += 4;
michael@0 4792 d += 4;
michael@0 4793 }
michael@0 4794
michael@0 4795 if (w >= 2)
michael@0 4796 {
michael@0 4797 *(uint16_t *)d = *(uint16_t *)s;
michael@0 4798 w -= 2;
michael@0 4799 s += 2;
michael@0 4800 d += 2;
michael@0 4801 }
michael@0 4802 }
michael@0 4803
michael@0 4804 return TRUE;
michael@0 4805 }
michael@0 4806
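/* Straight rectangle copy: forwards the image geometry to sse2_blt. */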
michael@0 4807 static void
michael@0 4808 sse2_composite_copy_area (pixman_implementation_t *imp,
michael@0 4809 pixman_composite_info_t *info)
michael@0 4810 {
michael@0 4811 PIXMAN_COMPOSITE_ARGS (info);
michael@0 4812 sse2_blt (imp, src_image->bits.bits,
michael@0 4813 dest_image->bits.bits,
michael@0 4814 src_image->bits.rowstride,
michael@0 4815 dest_image->bits.rowstride,
michael@0 4816 PIXMAN_FORMAT_BPP (src_image->bits.format),
michael@0 4817 PIXMAN_FORMAT_BPP (dest_image->bits.format),
michael@0 4818 src_x, src_y, dest_x, dest_y, width, height);
michael@0 4819 }
michael@0 4820
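/*
 * OVER, x8r8g8b8 source with an a8 mask, into a 32 bpp destination.
 * The undefined source alpha byte is forced to 0xff (via 0xff000000 /
 * mask_ff000000), so in_over treats the source as opaque and only the
 * mask modulates it.
 */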
michael@0 4821 static void
michael@0 4822 sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
michael@0 4823 pixman_composite_info_t *info)
michael@0 4824 {
michael@0 4825 PIXMAN_COMPOSITE_ARGS (info);
michael@0 4826 uint32_t *src, *src_line, s;
michael@0 4827 uint32_t *dst, *dst_line, d;
michael@0 4828 uint8_t *mask, *mask_line;
michael@0 4829 uint32_t m;
michael@0 4830 int src_stride, mask_stride, dst_stride;
michael@0 4831 int32_t w;
michael@0 4832 __m128i ms;
michael@0 4833
michael@0 4834 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
michael@0 4835 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
michael@0 4836 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
michael@0 4837
michael@0 4838 PIXMAN_IMAGE_GET_LINE (
michael@0 4839 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
michael@0 4840 PIXMAN_IMAGE_GET_LINE (
michael@0 4841 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
michael@0 4842 PIXMAN_IMAGE_GET_LINE (
michael@0 4843 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
michael@0 4844
michael@0 4845 while (height--)
michael@0 4846 {
michael@0 4847 src = src_line;
michael@0 4848 src_line += src_stride;
michael@0 4849 dst = dst_line;
michael@0 4850 dst_line += dst_stride;
michael@0 4851 mask = mask_line;
michael@0 4852 mask_line += mask_stride;
michael@0 4853
michael@0 4854 w = width;
michael@0 4855
michael@0 4856 while (w && (uintptr_t)dst & 15)
michael@0 4857 {
michael@0 4858 s = 0xff000000 | *src++;
michael@0 4859 m = (uint32_t) *mask++;
michael@0 4860 d = *dst;
michael@0 4861 ms = unpack_32_1x128 (s);
michael@0 4862
michael@0 4863 if (m != 0xff)
michael@0 4864 {
michael@0 4865 __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
michael@0 4866 __m128i md = unpack_32_1x128 (d);
michael@0 4867
michael@0 4868 ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
michael@0 4869 }
michael@0 4870
michael@0 4871 *dst++ = pack_1x128_32 (ms);
michael@0 4872 w--;
michael@0 4873 }
michael@0 4874
michael@0 4875 while (w >= 4)
michael@0 4876 {
michael@0 4877 m = *(uint32_t*) mask;
michael@0 4878 xmm_src = _mm_or_si128 (
michael@0 4879 load_128_unaligned ((__m128i*)src), mask_ff000000);
michael@0 4880
michael@0 4881 if (m == 0xffffffff)
michael@0 4882 {
michael@0 4883 save_128_aligned ((__m128i*)dst, xmm_src);
michael@0 4884 }
michael@0 4885 else
michael@0 4886 {
michael@0 4887 xmm_dst = load_128_aligned ((__m128i*)dst);
michael@0 4888
michael@0 4889 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
michael@0 4890
michael@0 4891 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
michael@0 4892 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
michael@0 4893 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
michael@0 4894
michael@0 4895 expand_alpha_rev_2x128 (
michael@0 4896 xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
michael@0 4897
michael@0 4898 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
michael@0 4899 &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
michael@0 4900 &xmm_dst_lo, &xmm_dst_hi);
michael@0 4901
michael@0 4902 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 4903 }
michael@0 4904
michael@0 4905 src += 4;
michael@0 4906 dst += 4;
michael@0 4907 mask += 4;
michael@0 4908 w -= 4;
michael@0 4909 }
michael@0 4910
michael@0 4911 while (w)
michael@0 4912 {
michael@0 4913 m = (uint32_t) *mask++;
michael@0 4914
michael@0 4915 if (m)
michael@0 4916 {
michael@0 4917 s = 0xff000000 | *src;
michael@0 4918
michael@0 4919 if (m == 0xff)
michael@0 4920 {
michael@0 4921 *dst = s;
michael@0 4922 }
michael@0 4923 else
michael@0 4924 {
michael@0 4925 __m128i ma, md, ms;
michael@0 4926
michael@0 4927 d = *dst;
michael@0 4928
michael@0 4929 ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
michael@0 4930 md = unpack_32_1x128 (d);
michael@0 4931 ms = unpack_32_1x128 (s);
michael@0 4932
michael@0 4933 *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
michael@0 4934 }
michael@0 4935
michael@0 4936 }
michael@0 4937
michael@0 4938 src++;
michael@0 4939 dst++;
michael@0 4940 w--;
michael@0 4941 }
michael@0 4942 }
michael@0 4943
michael@0 4944 }
michael@0 4945
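/*
 * OVER, a8r8g8b8 source with an a8 mask, into a 32 bpp destination.
 * A 0xffffffff mask word over a fully opaque source block is a plain
 * store; otherwise the mask and source alpha are expanded and in_over
 * is applied to each half.
 */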
michael@0 4946 static void
michael@0 4947 sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
michael@0 4948 pixman_composite_info_t *info)
michael@0 4949 {
michael@0 4950 PIXMAN_COMPOSITE_ARGS (info);
michael@0 4951 uint32_t *src, *src_line, s;
michael@0 4952 uint32_t *dst, *dst_line, d;
michael@0 4953 uint8_t *mask, *mask_line;
michael@0 4954 uint32_t m;
michael@0 4955 int src_stride, mask_stride, dst_stride;
michael@0 4956 int32_t w;
michael@0 4957
michael@0 4958 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
michael@0 4959 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
michael@0 4960 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
michael@0 4961
michael@0 4962 PIXMAN_IMAGE_GET_LINE (
michael@0 4963 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
michael@0 4964 PIXMAN_IMAGE_GET_LINE (
michael@0 4965 mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
michael@0 4966 PIXMAN_IMAGE_GET_LINE (
michael@0 4967 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
michael@0 4968
michael@0 4969 while (height--)
michael@0 4970 {
michael@0 4971 src = src_line;
michael@0 4972 src_line += src_stride;
michael@0 4973 dst = dst_line;
michael@0 4974 dst_line += dst_stride;
michael@0 4975 mask = mask_line;
michael@0 4976 mask_line += mask_stride;
michael@0 4977
michael@0 4978 w = width;
michael@0 4979
michael@0 4980 while (w && (uintptr_t)dst & 15)
michael@0 4981 {
michael@0 4982 uint32_t sa;
michael@0 4983
michael@0 4984 s = *src++;
michael@0 4985 m = (uint32_t) *mask++;
michael@0 4986 d = *dst;
michael@0 4987
michael@0 4988 sa = s >> 24;
michael@0 4989
michael@0 4990 if (m)
michael@0 4991 {
michael@0 4992 if (sa == 0xff && m == 0xff)
michael@0 4993 {
michael@0 4994 *dst = s;
michael@0 4995 }
michael@0 4996 else
michael@0 4997 {
michael@0 4998 __m128i ms, md, ma, msa;
michael@0 4999
michael@0 5000 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
michael@0 5001 ms = unpack_32_1x128 (s);
michael@0 5002 md = unpack_32_1x128 (d);
michael@0 5003
michael@0 5004 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
michael@0 5005
michael@0 5006 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
michael@0 5007 }
michael@0 5008 }
michael@0 5009
michael@0 5010 dst++;
michael@0 5011 w--;
michael@0 5012 }
michael@0 5013
michael@0 5014 while (w >= 4)
michael@0 5015 {
michael@0 5016 m = *(uint32_t *) mask;
michael@0 5017
michael@0 5018 if (m)
michael@0 5019 {
michael@0 5020 xmm_src = load_128_unaligned ((__m128i*)src);
michael@0 5021
michael@0 5022 if (m == 0xffffffff && is_opaque (xmm_src))
michael@0 5023 {
michael@0 5024 save_128_aligned ((__m128i *)dst, xmm_src);
michael@0 5025 }
michael@0 5026 else
michael@0 5027 {
michael@0 5028 xmm_dst = load_128_aligned ((__m128i *)dst);
michael@0 5029
michael@0 5030 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
michael@0 5031
michael@0 5032 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
michael@0 5033 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
michael@0 5034 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
michael@0 5035
michael@0 5036 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
michael@0 5037 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
michael@0 5038
michael@0 5039 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
michael@0 5040 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
michael@0 5041
michael@0 5042 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 5043 }
michael@0 5044 }
michael@0 5045
michael@0 5046 src += 4;
michael@0 5047 dst += 4;
michael@0 5048 mask += 4;
michael@0 5049 w -= 4;
michael@0 5050 }
michael@0 5051
michael@0 5052 while (w)
michael@0 5053 {
michael@0 5054 uint32_t sa;
michael@0 5055
michael@0 5056 s = *src++;
michael@0 5057 m = (uint32_t) *mask++;
michael@0 5058 d = *dst;
michael@0 5059
michael@0 5060 sa = s >> 24;
michael@0 5061
michael@0 5062 if (m)
michael@0 5063 {
michael@0 5064 if (sa == 0xff && m == 0xff)
michael@0 5065 {
michael@0 5066 *dst = s;
michael@0 5067 }
michael@0 5068 else
michael@0 5069 {
michael@0 5070 __m128i ms, md, ma, msa;
michael@0 5071
michael@0 5072 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
michael@0 5073 ms = unpack_32_1x128 (s);
michael@0 5074 md = unpack_32_1x128 (d);
michael@0 5075
michael@0 5076 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
michael@0 5077
michael@0 5078 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
michael@0 5079 }
michael@0 5080 }
michael@0 5081
michael@0 5082 dst++;
michael@0 5083 w--;
michael@0 5084 }
michael@0 5085 }
michael@0 5087 }
michael@0 5088
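/*
 * OVER_REVERSE with a solid source: the destination is composited OVER
 * the solid color (dst = dst OVER src), so the color only shows through
 * where the destination is not already opaque.
 */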
michael@0 5089 static void
michael@0 5090 sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
michael@0 5091 pixman_composite_info_t *info)
michael@0 5092 {
michael@0 5093 PIXMAN_COMPOSITE_ARGS (info);
michael@0 5094 uint32_t src;
michael@0 5095 uint32_t *dst_line, *dst;
michael@0 5096 __m128i xmm_src;
michael@0 5097 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
michael@0 5098 __m128i xmm_dsta_hi, xmm_dsta_lo;
michael@0 5099 int dst_stride;
michael@0 5100 int32_t w;
michael@0 5101
michael@0 5102 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
michael@0 5103
michael@0 5104 if (src == 0)
michael@0 5105 return;
michael@0 5106
michael@0 5107 PIXMAN_IMAGE_GET_LINE (
michael@0 5108 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
michael@0 5109
michael@0 5110 xmm_src = expand_pixel_32_1x128 (src);
michael@0 5111
michael@0 5112 while (height--)
michael@0 5113 {
michael@0 5114 dst = dst_line;
michael@0 5115
michael@0 5116 dst_line += dst_stride;
michael@0 5117 w = width;
michael@0 5118
michael@0 5119 while (w && (uintptr_t)dst & 15)
michael@0 5120 {
michael@0 5121 __m128i vd;
michael@0 5122
michael@0 5123 vd = unpack_32_1x128 (*dst);
michael@0 5124
michael@0 5125 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
michael@0 5126 xmm_src));
michael@0 5127 w--;
michael@0 5128 dst++;
michael@0 5129 }
michael@0 5130
michael@0 5131 while (w >= 4)
michael@0 5132 {
michael@0 5133 __m128i tmp_lo, tmp_hi;
michael@0 5134
michael@0 5135 xmm_dst = load_128_aligned ((__m128i*)dst);
michael@0 5136
michael@0 5137 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
michael@0 5138 expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
michael@0 5139
michael@0 5140 tmp_lo = xmm_src;
michael@0 5141 tmp_hi = xmm_src;
michael@0 5142
michael@0 5143 over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
michael@0 5144 &xmm_dsta_lo, &xmm_dsta_hi,
michael@0 5145 &tmp_lo, &tmp_hi);
michael@0 5146
michael@0 5147 save_128_aligned (
michael@0 5148 (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
michael@0 5149
michael@0 5150 w -= 4;
michael@0 5151 dst += 4;
michael@0 5152 }
michael@0 5153
michael@0 5154 while (w)
michael@0 5155 {
michael@0 5156 __m128i vd;
michael@0 5157
michael@0 5158 vd = unpack_32_1x128 (*dst);
michael@0 5159
michael@0 5160 *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
michael@0 5161 xmm_src));
michael@0 5162 w--;
michael@0 5163 dst++;
michael@0 5164 }
michael@0 5166 }
michael@0 5168 }
michael@0 5169
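/*
 * OVER with an a8r8g8b8 mask image: only the alpha byte of each mask
 * pixel is used (m = *mask >> 24), making this the 32-bit-mask
 * counterpart of sse2_composite_over_8888_8_8888 above.
 */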
michael@0 5170 static void
michael@0 5171 sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
michael@0 5172 pixman_composite_info_t *info)
michael@0 5173 {
michael@0 5174 PIXMAN_COMPOSITE_ARGS (info);
michael@0 5175 uint32_t *src, *src_line, s;
michael@0 5176 uint32_t *dst, *dst_line, d;
michael@0 5177 uint32_t *mask, *mask_line;
michael@0 5178 uint32_t m;
michael@0 5179 int src_stride, mask_stride, dst_stride;
michael@0 5180 int32_t w;
michael@0 5181
michael@0 5182 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
michael@0 5183 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
michael@0 5184 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
michael@0 5185
michael@0 5186 PIXMAN_IMAGE_GET_LINE (
michael@0 5187 dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
michael@0 5188 PIXMAN_IMAGE_GET_LINE (
michael@0 5189 mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
michael@0 5190 PIXMAN_IMAGE_GET_LINE (
michael@0 5191 src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
michael@0 5192
michael@0 5193 while (height--)
michael@0 5194 {
michael@0 5195 src = src_line;
michael@0 5196 src_line += src_stride;
michael@0 5197 dst = dst_line;
michael@0 5198 dst_line += dst_stride;
michael@0 5199 mask = mask_line;
michael@0 5200 mask_line += mask_stride;
michael@0 5201
michael@0 5202 w = width;
michael@0 5203
michael@0 5204 while (w && (uintptr_t)dst & 15)
michael@0 5205 {
michael@0 5206 uint32_t sa;
michael@0 5207
michael@0 5208 s = *src++;
michael@0 5209 m = (*mask++) >> 24;
michael@0 5210 d = *dst;
michael@0 5211
michael@0 5212 sa = s >> 24;
michael@0 5213
michael@0 5214 if (m)
michael@0 5215 {
michael@0 5216 if (sa == 0xff && m == 0xff)
michael@0 5217 {
michael@0 5218 *dst = s;
michael@0 5219 }
michael@0 5220 else
michael@0 5221 {
michael@0 5222 __m128i ms, md, ma, msa;
michael@0 5223
michael@0 5224 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
michael@0 5225 ms = unpack_32_1x128 (s);
michael@0 5226 md = unpack_32_1x128 (d);
michael@0 5227
michael@0 5228 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
michael@0 5229
michael@0 5230 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
michael@0 5231 }
michael@0 5232 }
michael@0 5233
michael@0 5234 dst++;
michael@0 5235 w--;
michael@0 5236 }
michael@0 5237
michael@0 5238 while (w >= 4)
michael@0 5239 {
michael@0 5240 xmm_mask = load_128_unaligned ((__m128i*)mask);
michael@0 5241
michael@0 5242 if (!is_transparent (xmm_mask))
michael@0 5243 {
michael@0 5244 xmm_src = load_128_unaligned ((__m128i*)src);
michael@0 5245
michael@0 5246 if (is_opaque (xmm_mask) && is_opaque (xmm_src))
michael@0 5247 {
michael@0 5248 save_128_aligned ((__m128i *)dst, xmm_src);
michael@0 5249 }
michael@0 5250 else
michael@0 5251 {
michael@0 5252 xmm_dst = load_128_aligned ((__m128i *)dst);
michael@0 5253
michael@0 5254 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
michael@0 5255 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
michael@0 5256 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
michael@0 5257
michael@0 5258 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
michael@0 5259 expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
michael@0 5260
michael@0 5261 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
michael@0 5262 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
michael@0 5263
michael@0 5264 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 5265 }
michael@0 5266 }
michael@0 5267
michael@0 5268 src += 4;
michael@0 5269 dst += 4;
michael@0 5270 mask += 4;
michael@0 5271 w -= 4;
michael@0 5272 }
michael@0 5273
michael@0 5274 while (w)
michael@0 5275 {
michael@0 5276 uint32_t sa;
michael@0 5277
michael@0 5278 s = *src++;
michael@0 5279 m = (*mask++) >> 24;
michael@0 5280 d = *dst;
michael@0 5281
michael@0 5282 sa = s >> 24;
michael@0 5283
michael@0 5284 if (m)
michael@0 5285 {
michael@0 5286 if (sa == 0xff && m == 0xff)
michael@0 5287 {
michael@0 5288 *dst = s;
michael@0 5289 }
michael@0 5290 else
michael@0 5291 {
michael@0 5292 __m128i ms, md, ma, msa;
michael@0 5293
michael@0 5294 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
michael@0 5295 ms = unpack_32_1x128 (s);
michael@0 5296 md = unpack_32_1x128 (d);
michael@0 5297
michael@0 5298 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
michael@0 5299
michael@0 5300 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
michael@0 5301 }
michael@0 5302 }
michael@0 5303
michael@0 5304 dst++;
michael@0 5305 w--;
michael@0 5306 }
michael@0 5307 }
michael@0 5309 }
michael@0 5310
michael@0 5311 /* A variant of 'sse2_combine_over_u' with minor tweaks */
michael@0 5312 static force_inline void
michael@0 5313 scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd,
michael@0 5314 const uint32_t* ps,
michael@0 5315 int32_t w,
michael@0 5316 pixman_fixed_t vx,
michael@0 5317 pixman_fixed_t unit_x,
michael@0 5318 pixman_fixed_t src_width_fixed,
michael@0 5319 pixman_bool_t fully_transparent_src)
michael@0 5320 {
michael@0 5321 uint32_t s, d;
michael@0 5322 const uint32_t* pm = NULL;
michael@0 5323
michael@0 5324 __m128i xmm_dst_lo, xmm_dst_hi;
michael@0 5325 __m128i xmm_src_lo, xmm_src_hi;
michael@0 5326 __m128i xmm_alpha_lo, xmm_alpha_hi;
michael@0 5327
michael@0 5328 if (fully_transparent_src)
michael@0 5329 return;
michael@0 5330
michael@0 5331 /* Align dst on a 16-byte boundary */
michael@0 5332 while (w && ((uintptr_t)pd & 15))
michael@0 5333 {
michael@0 5334 d = *pd;
michael@0 5335 s = combine1 (ps + pixman_fixed_to_int (vx), pm);
michael@0 5336 vx += unit_x;
michael@0 5337 while (vx >= 0)
michael@0 5338 vx -= src_width_fixed;
michael@0 5339
michael@0 5340 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
michael@0 5341 if (pm)
michael@0 5342 pm++;
michael@0 5343 w--;
michael@0 5344 }
michael@0 5345
michael@0 5346 while (w >= 4)
michael@0 5347 {
michael@0 5348 __m128i tmp;
michael@0 5349 uint32_t tmp1, tmp2, tmp3, tmp4;
michael@0 5350
michael@0 5351 tmp1 = *(ps + pixman_fixed_to_int (vx));
michael@0 5352 vx += unit_x;
michael@0 5353 while (vx >= 0)
michael@0 5354 vx -= src_width_fixed;
michael@0 5355 tmp2 = *(ps + pixman_fixed_to_int (vx));
michael@0 5356 vx += unit_x;
michael@0 5357 while (vx >= 0)
michael@0 5358 vx -= src_width_fixed;
michael@0 5359 tmp3 = *(ps + pixman_fixed_to_int (vx));
michael@0 5360 vx += unit_x;
michael@0 5361 while (vx >= 0)
michael@0 5362 vx -= src_width_fixed;
michael@0 5363 tmp4 = *(ps + pixman_fixed_to_int (vx));
michael@0 5364 vx += unit_x;
michael@0 5365 while (vx >= 0)
michael@0 5366 vx -= src_width_fixed;
michael@0 5367
michael@0 5368 tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
michael@0 5369
michael@0 5370 xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
michael@0 5371
michael@0 5372 if (is_opaque (xmm_src_hi))
michael@0 5373 {
michael@0 5374 save_128_aligned ((__m128i*)pd, xmm_src_hi);
michael@0 5375 }
michael@0 5376 else if (!is_zero (xmm_src_hi))
michael@0 5377 {
michael@0 5378 xmm_dst_hi = load_128_aligned ((__m128i*) pd);
michael@0 5379
michael@0 5380 unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
michael@0 5381 unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
michael@0 5382
michael@0 5383 expand_alpha_2x128 (
michael@0 5384 xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
michael@0 5385
michael@0 5386 over_2x128 (&xmm_src_lo, &xmm_src_hi,
michael@0 5387 &xmm_alpha_lo, &xmm_alpha_hi,
michael@0 5388 &xmm_dst_lo, &xmm_dst_hi);
michael@0 5389
michael@0 5390 /* rebuild the 4 pixel data and save */
michael@0 5391 save_128_aligned ((__m128i*)pd,
michael@0 5392 pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 5393 }
michael@0 5394
michael@0 5395 w -= 4;
michael@0 5396 pd += 4;
michael@0 5397 if (pm)
michael@0 5398 pm += 4;
michael@0 5399 }
michael@0 5400
michael@0 5401 while (w)
michael@0 5402 {
michael@0 5403 d = *pd;
michael@0 5404 s = combine1 (ps + pixman_fixed_to_int (vx), pm);
michael@0 5405 vx += unit_x;
michael@0 5406 while (vx >= 0)
michael@0 5407 vx -= src_width_fixed;
michael@0 5408
michael@0 5409 *pd++ = core_combine_over_u_pixel_sse2 (s, d);
michael@0 5410 if (pm)
michael@0 5411 pm++;
michael@0 5412
michael@0 5413 w--;
michael@0 5414 }
michael@0 5415 }
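
/*
 * The vx bookkeeping above repeats one idiom per fetched pixel.  A
 * scalar sketch of that step (illustrative only; the helper name is
 * hypothetical, and it assumes NORMAL repeat keeps vx within
 * [-src_width_fixed, 0) so that pixman_fixed_to_int (vx) indexes
 * backwards from the end of the source scanline):
 */
static force_inline int
nearest_step_sketch (pixman_fixed_t *vx,
                     pixman_fixed_t  unit_x,
                     pixman_fixed_t  src_width_fixed)
{
    int i = pixman_fixed_to_int (*vx); /* current source pixel index */

    *vx += unit_x;                     /* advance in 16.16 fixed point */
    while (*vx >= 0)                   /* wrap around for repeat */
        *vx -= src_width_fixed;

    return i;
}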
michael@0 5416
michael@0 5417 FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
michael@0 5418 scaled_nearest_scanline_sse2_8888_8888_OVER,
michael@0 5419 uint32_t, uint32_t, COVER)
michael@0 5420 FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
michael@0 5421 scaled_nearest_scanline_sse2_8888_8888_OVER,
michael@0 5422 uint32_t, uint32_t, NONE)
michael@0 5423 FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
michael@0 5424 scaled_nearest_scanline_sse2_8888_8888_OVER,
michael@0 5425 uint32_t, uint32_t, PAD)
michael@0 5426 FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
michael@0 5427 scaled_nearest_scanline_sse2_8888_8888_OVER,
michael@0 5428 uint32_t, uint32_t, NORMAL)
michael@0 5429
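/*
 * Nearest-scaled OVER with a solid mask: the constant mask alpha
 * (*mask >> 24) is broadcast once into xmm_mask and reused for every
 * pixel; a zero mask or a fully transparent source turns the whole
 * scanline into a no-op.
 */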
michael@0 5430 static force_inline void
michael@0 5431 scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
michael@0 5432 uint32_t * dst,
michael@0 5433 const uint32_t * src,
michael@0 5434 int32_t w,
michael@0 5435 pixman_fixed_t vx,
michael@0 5436 pixman_fixed_t unit_x,
michael@0 5437 pixman_fixed_t src_width_fixed,
michael@0 5438 pixman_bool_t zero_src)
michael@0 5439 {
michael@0 5440 __m128i xmm_mask;
michael@0 5441 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
michael@0 5442 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
michael@0 5443 __m128i xmm_alpha_lo, xmm_alpha_hi;
michael@0 5444
michael@0 5445 if (zero_src || (*mask >> 24) == 0)
michael@0 5446 return;
michael@0 5447
michael@0 5448 xmm_mask = create_mask_16_128 (*mask >> 24);
michael@0 5449
michael@0 5450 while (w && (uintptr_t)dst & 15)
michael@0 5451 {
michael@0 5452 uint32_t s = *(src + pixman_fixed_to_int (vx));
michael@0 5453 vx += unit_x;
michael@0 5454 while (vx >= 0)
michael@0 5455 vx -= src_width_fixed;
michael@0 5456
michael@0 5457 if (s)
michael@0 5458 {
michael@0 5459 uint32_t d = *dst;
michael@0 5460
michael@0 5461 __m128i ms = unpack_32_1x128 (s);
michael@0 5462 __m128i alpha = expand_alpha_1x128 (ms);
michael@0 5463 __m128i dest = xmm_mask;
michael@0 5464 __m128i alpha_dst = unpack_32_1x128 (d);
michael@0 5465
michael@0 5466 *dst = pack_1x128_32 (
michael@0 5467 in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
michael@0 5468 }
michael@0 5469 dst++;
michael@0 5470 w--;
michael@0 5471 }
michael@0 5472
michael@0 5473 while (w >= 4)
michael@0 5474 {
michael@0 5475 uint32_t tmp1, tmp2, tmp3, tmp4;
michael@0 5476
michael@0 5477 tmp1 = *(src + pixman_fixed_to_int (vx));
michael@0 5478 vx += unit_x;
michael@0 5479 while (vx >= 0)
michael@0 5480 vx -= src_width_fixed;
michael@0 5481 tmp2 = *(src + pixman_fixed_to_int (vx));
michael@0 5482 vx += unit_x;
michael@0 5483 while (vx >= 0)
michael@0 5484 vx -= src_width_fixed;
michael@0 5485 tmp3 = *(src + pixman_fixed_to_int (vx));
michael@0 5486 vx += unit_x;
michael@0 5487 while (vx >= 0)
michael@0 5488 vx -= src_width_fixed;
michael@0 5489 tmp4 = *(src + pixman_fixed_to_int (vx));
michael@0 5490 vx += unit_x;
michael@0 5491 while (vx >= 0)
michael@0 5492 vx -= src_width_fixed;
michael@0 5493
michael@0 5494 xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
michael@0 5495
michael@0 5496 if (!is_zero (xmm_src))
michael@0 5497 {
michael@0 5498 xmm_dst = load_128_aligned ((__m128i*)dst);
michael@0 5499
michael@0 5500 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
michael@0 5501 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
michael@0 5502 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
michael@0 5503 &xmm_alpha_lo, &xmm_alpha_hi);
michael@0 5504
michael@0 5505 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
michael@0 5506 &xmm_alpha_lo, &xmm_alpha_hi,
michael@0 5507 &xmm_mask, &xmm_mask,
michael@0 5508 &xmm_dst_lo, &xmm_dst_hi);
michael@0 5509
michael@0 5510 save_128_aligned (
michael@0 5511 (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 5512 }
michael@0 5513
michael@0 5514 dst += 4;
michael@0 5515 w -= 4;
michael@0 5516 }
michael@0 5517
michael@0 5518 while (w)
michael@0 5519 {
michael@0 5520 uint32_t s = *(src + pixman_fixed_to_int (vx));
michael@0 5521 vx += unit_x;
michael@0 5522 while (vx >= 0)
michael@0 5523 vx -= src_width_fixed;
michael@0 5524
michael@0 5525 if (s)
michael@0 5526 {
michael@0 5527 uint32_t d = *dst;
michael@0 5528
michael@0 5529 __m128i ms = unpack_32_1x128 (s);
michael@0 5530 __m128i alpha = expand_alpha_1x128 (ms);
michael@0 5531 __m128i mask = xmm_mask;
michael@0 5532 __m128i dest = unpack_32_1x128 (d);
michael@0 5533
michael@0 5534 *dst = pack_1x128_32 (
michael@0 5535 in_over_1x128 (&ms, &alpha, &mask, &dest));
michael@0 5536 }
michael@0 5537
michael@0 5538 dst++;
michael@0 5539 w--;
michael@0 5540 }
michael@0 5542 }
michael@0 5543
michael@0 5544 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
michael@0 5545 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
michael@0 5546 uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
michael@0 5547 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
michael@0 5548 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
michael@0 5549 uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
michael@0 5550 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
michael@0 5551 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
michael@0 5552 uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
michael@0 5553 FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
michael@0 5554 scaled_nearest_scanline_sse2_8888_n_8888_OVER,
michael@0 5555 uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
michael@0 5556
michael@0 5557 #define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1)
michael@0 5558
michael@0 5559 #define BILINEAR_DECLARE_VARIABLES \
michael@0 5560 const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \
michael@0 5561 const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \
michael@0 5562 const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\
michael@0 5563 const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \
michael@0 5564 const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\
michael@0 5565 const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \
michael@0 5566 const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x, \
michael@0 5567 unit_x, unit_x, unit_x, unit_x); \
michael@0 5568 const __m128i xmm_zero = _mm_setzero_si128 (); \
michael@0 5569 __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx)
michael@0 5570
michael@0 5571 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \
michael@0 5572 do { \
michael@0 5573 __m128i xmm_wh, xmm_lo, xmm_hi, a; \
michael@0 5574 /* fetch 2x2 pixel block into sse2 registers */ \
michael@0 5575 __m128i tltr = _mm_loadl_epi64 ( \
michael@0 5576 (__m128i *)&src_top[pixman_fixed_to_int (vx)]); \
michael@0 5577 __m128i blbr = _mm_loadl_epi64 ( \
michael@0 5578 (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]); \
michael@0 5579 vx += unit_x; \
michael@0 5580 /* vertical interpolation */ \
michael@0 5581 a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), \
michael@0 5582 xmm_wt), \
michael@0 5583 _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), \
michael@0 5584 xmm_wb)); \
michael@0 5585 if (BILINEAR_INTERPOLATION_BITS < 8) \
michael@0 5586 { \
michael@0 5587 /* calculate horizontal weights */ \
michael@0 5588 xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7, \
michael@0 5589 _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \
michael@0 5590 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
michael@0 5591 /* horizontal interpolation */ \
michael@0 5592 a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \
michael@0 5593 a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh); \
michael@0 5594 } \
michael@0 5595 else \
michael@0 5596 { \
michael@0 5597 /* calculate horizontal weights */ \
michael@0 5598 xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8, \
michael@0 5599 _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \
michael@0 5600 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
michael@0 5601 /* horizontal interpolation */ \
michael@0 5602 xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \
michael@0 5603 xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \
michael@0 5604 a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \
michael@0 5605 _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \
michael@0 5606 } \
michael@0 5607 /* shift and pack the result */ \
michael@0 5608 a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2); \
michael@0 5609 a = _mm_packs_epi32 (a, a); \
michael@0 5610 a = _mm_packus_epi16 (a, a); \
michael@0 5611 pix = _mm_cvtsi128_si32 (a); \
michael@0 5612 } while (0)
michael@0 5613
michael@0 5614 #define BILINEAR_SKIP_ONE_PIXEL() \
michael@0 5615 do { \
michael@0 5616 vx += unit_x; \
michael@0 5617 xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \
michael@0 5618 } while (0)
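
/*
 * For reference, the arithmetic that BILINEAR_INTERPOLATE_ONE_PIXEL
 * performs for a single 8-bit channel, written out as scalar code.
 * This is an illustrative sketch only: the function name is
 * hypothetical, and it assumes wt + wb == wl + wr ==
 * (1 << BILINEAR_INTERPOLATION_BITS).
 */
static force_inline uint32_t
bilinear_channel_sketch (uint32_t tl, uint32_t tr,
                         uint32_t bl, uint32_t br,
                         int wt, int wb, int wl, int wr)
{
    /* vertical pass: blend the top and bottom rows */
    uint32_t l = tl * wt + bl * wb;   /* left column  */
    uint32_t r = tr * wt + br * wb;   /* right column */

    /* horizontal pass, then shift out both weight factors */
    return (l * wl + r * wr) >> (BILINEAR_INTERPOLATION_BITS * 2);
}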
michael@0 5619
michael@0 5620 static force_inline void
michael@0 5621 scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst,
michael@0 5622 const uint32_t * mask,
michael@0 5623 const uint32_t * src_top,
michael@0 5624 const uint32_t * src_bottom,
michael@0 5625 int32_t w,
michael@0 5626 int wt,
michael@0 5627 int wb,
michael@0 5628 pixman_fixed_t vx,
michael@0 5629 pixman_fixed_t unit_x,
michael@0 5630 pixman_fixed_t max_vx,
michael@0 5631 pixman_bool_t zero_src)
michael@0 5632 {
michael@0 5633 BILINEAR_DECLARE_VARIABLES;
michael@0 5634 uint32_t pix1, pix2, pix3, pix4;
michael@0 5635
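/* (w -= 4) leaves w negative on loop exit, but in two's complement its
 * low two bits still equal the original width modulo 4, so the (w & 2)
 * and (w & 1) blocks below handle the remaining tail pixels. */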
michael@0 5636 while ((w -= 4) >= 0)
michael@0 5637 {
michael@0 5638 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
michael@0 5639 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
michael@0 5640 BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
michael@0 5641 BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
michael@0 5642 *dst++ = pix1;
michael@0 5643 *dst++ = pix2;
michael@0 5644 *dst++ = pix3;
michael@0 5645 *dst++ = pix4;
michael@0 5646 }
michael@0 5647
michael@0 5648 if (w & 2)
michael@0 5649 {
michael@0 5650 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
michael@0 5651 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
michael@0 5652 *dst++ = pix1;
michael@0 5653 *dst++ = pix2;
michael@0 5654 }
michael@0 5655
michael@0 5656 if (w & 1)
michael@0 5657 {
michael@0 5658 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
michael@0 5659 *dst = pix1;
michael@0 5660 }
michael@0 5662 }
michael@0 5663
michael@0 5664 /* Pass an extra NULL argument to the existing bilinear fast paths to
michael@0 5665 * indicate that they do not need two-pass processing */
michael@0 5666
michael@0 5667 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
michael@0 5668 scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
michael@0 5669 uint32_t, uint32_t, uint32_t,
michael@0 5670 COVER, FLAG_NONE)
michael@0 5671 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
michael@0 5672 scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
michael@0 5673 uint32_t, uint32_t, uint32_t,
michael@0 5674 PAD, FLAG_NONE)
michael@0 5675 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
michael@0 5676 scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
michael@0 5677 uint32_t, uint32_t, uint32_t,
michael@0 5678 NONE, FLAG_NONE)
michael@0 5679 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
michael@0 5680 scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL,
michael@0 5681 uint32_t, uint32_t, uint32_t,
michael@0 5682 NORMAL, FLAG_NONE)
michael@0 5683
michael@0 5684 static force_inline void
michael@0 5685 scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst,
michael@0 5686 const uint32_t * mask,
michael@0 5687 const uint32_t * src_top,
michael@0 5688 const uint32_t * src_bottom,
michael@0 5689 int32_t w,
michael@0 5690 int wt,
michael@0 5691 int wb,
michael@0 5692 pixman_fixed_t vx,
michael@0 5693 pixman_fixed_t unit_x,
michael@0 5694 pixman_fixed_t max_vx,
michael@0 5695 pixman_bool_t zero_src)
michael@0 5696 {
michael@0 5697 BILINEAR_DECLARE_VARIABLES;
michael@0 5698 uint32_t pix1, pix2, pix3, pix4;
michael@0 5699
michael@0 5700 while (w && ((uintptr_t)dst & 15))
michael@0 5701 {
michael@0 5702 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
michael@0 5703
michael@0 5704 if (pix1)
michael@0 5705 {
michael@0 5706 pix2 = *dst;
michael@0 5707 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
michael@0 5708 }
michael@0 5709
michael@0 5710 w--;
michael@0 5711 dst++;
michael@0 5712 }
michael@0 5713
michael@0 5714 while (w >= 4)
michael@0 5715 {
michael@0 5716 __m128i xmm_src;
michael@0 5717 __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
michael@0 5718 __m128i xmm_alpha_hi, xmm_alpha_lo;
michael@0 5719
michael@0 5720 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
michael@0 5721 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
michael@0 5722 BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
michael@0 5723 BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
michael@0 5724
michael@0 5725 xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
michael@0 5726
michael@0 5727 if (!is_zero (xmm_src))
michael@0 5728 {
michael@0 5729 if (is_opaque (xmm_src))
michael@0 5730 {
michael@0 5731 save_128_aligned ((__m128i *)dst, xmm_src);
michael@0 5732 }
michael@0 5733 else
michael@0 5734 {
michael@0 5735 __m128i xmm_dst = load_128_aligned ((__m128i *)dst);
michael@0 5736
michael@0 5737 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
michael@0 5738 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
michael@0 5739
michael@0 5740 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
michael@0 5741 over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
michael@0 5742 &xmm_dst_lo, &xmm_dst_hi);
michael@0 5743
michael@0 5744 save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 5745 }
michael@0 5746 }
michael@0 5747
michael@0 5748 w -= 4;
michael@0 5749 dst += 4;
michael@0 5750 }
michael@0 5751
michael@0 5752 while (w)
michael@0 5753 {
michael@0 5754 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
michael@0 5755
michael@0 5756 if (pix1)
michael@0 5757 {
michael@0 5758 pix2 = *dst;
michael@0 5759 *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
michael@0 5760 }
michael@0 5761
michael@0 5762 w--;
michael@0 5763 dst++;
michael@0 5764 }
michael@0 5765 }
michael@0 5766
michael@0 5767 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
michael@0 5768 scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
michael@0 5769 uint32_t, uint32_t, uint32_t,
michael@0 5770 COVER, FLAG_NONE)
michael@0 5771 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
michael@0 5772 scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
michael@0 5773 uint32_t, uint32_t, uint32_t,
michael@0 5774 PAD, FLAG_NONE)
michael@0 5775 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
michael@0 5776 scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
michael@0 5777 uint32_t, uint32_t, uint32_t,
michael@0 5778 NONE, FLAG_NONE)
michael@0 5779 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
michael@0 5780 scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL,
michael@0 5781 uint32_t, uint32_t, uint32_t,
michael@0 5782 NORMAL, FLAG_NONE)
michael@0 5783
michael@0 5784
michael@0 5785 /* An example of an SSE2 two-stage bilinear_over_8888_0565 fast path, implemented
michael@0 5786 as scaled_bilinear_scanline_sse2_8888_8888_SRC + op_bilinear_over_8888_0565 */
michael@0 5787
michael@0 5788 static void op_bilinear_over_8888_0565 (uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width)
michael@0 5789 {
michael@0 5790 /* Note: this is not really fast; it should use the 8-pixel loop from sse2_composite_over_8888_0565 */
michael@0 5791 while (--width >= 0)
michael@0 5792 {
michael@0 5793 *dst = composite_over_8888_0565pixel (*src, *dst);
michael@0 5794 src++;
michael@0 5795 dst++;
michael@0 5796 }
michael@0 5797 }
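
/*
 * Conceptually the two-stage path runs per scanline as sketched below;
 * the real plumbing and the temporary buffer are supplied by
 * FAST_BILINEAR_MAINLOOP_COMMON, and 'tmp' is a hypothetical name:
 *
 *     uint32_t tmp[WIDTH];
 *     scaled_bilinear_scanline_sse2_8888_8888_SRC (tmp, NULL, src_top,
 *                                                  src_bottom, WIDTH, ...);
 *     op_bilinear_over_8888_0565 (dst, NULL, tmp, WIDTH);
 */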
michael@0 5798
michael@0 5799 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_cover_OVER,
michael@0 5800 scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
michael@0 5801 uint32_t, uint32_t, uint16_t,
michael@0 5802 COVER, FLAG_NONE)
michael@0 5803 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_pad_OVER,
michael@0 5804 scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
michael@0 5805 uint32_t, uint32_t, uint16_t,
michael@0 5806 PAD, FLAG_NONE)
michael@0 5807 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_none_OVER,
michael@0 5808 scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
michael@0 5809 uint32_t, uint32_t, uint16_t,
michael@0 5810 NONE, FLAG_NONE)
michael@0 5811 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_normal_OVER,
michael@0 5812 scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565,
michael@0 5813 uint32_t, uint32_t, uint16_t,
michael@0 5814 NORMAL, FLAG_NONE)
michael@0 5815
michael@0 5816 /*****************************/
michael@0 5817
michael@0 5818 static force_inline void
michael@0 5819 scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst,
michael@0 5820 const uint8_t * mask,
michael@0 5821 const uint32_t * src_top,
michael@0 5822 const uint32_t * src_bottom,
michael@0 5823 int32_t w,
michael@0 5824 int wt,
michael@0 5825 int wb,
michael@0 5826 pixman_fixed_t vx,
michael@0 5827 pixman_fixed_t unit_x,
michael@0 5828 pixman_fixed_t max_vx,
michael@0 5829 pixman_bool_t zero_src)
michael@0 5830 {
michael@0 5831 BILINEAR_DECLARE_VARIABLES;
michael@0 5832 uint32_t pix1, pix2, pix3, pix4;
michael@0 5833 uint32_t m;
michael@0 5834
michael@0 5835 while (w && ((uintptr_t)dst & 15))
michael@0 5836 {
michael@0 5837 uint32_t sa;
michael@0 5838
michael@0 5839 m = (uint32_t) *mask++;
michael@0 5840
michael@0 5841 if (m)
michael@0 5842 {
michael@0 5843 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
michael@0 5844 sa = pix1 >> 24;
michael@0 5845
michael@0 5846 if (sa == 0xff && m == 0xff)
michael@0 5847 {
michael@0 5848 *dst = pix1;
michael@0 5849 }
michael@0 5850 else
michael@0 5851 {
michael@0 5852 __m128i ms, md, ma, msa;
michael@0 5853
michael@0 5854 pix2 = *dst;
michael@0 5855 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
michael@0 5856 ms = unpack_32_1x128 (pix1);
michael@0 5857 md = unpack_32_1x128 (pix2);
michael@0 5858
michael@0 5859 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
michael@0 5860
michael@0 5861 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
michael@0 5862 }
michael@0 5863 }
michael@0 5864 else
michael@0 5865 {
michael@0 5866 BILINEAR_SKIP_ONE_PIXEL ();
michael@0 5867 }
michael@0 5868
michael@0 5869 w--;
michael@0 5870 dst++;
michael@0 5871 }
michael@0 5872
michael@0 5873 while (w >= 4)
michael@0 5874 {
michael@0 5875 __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
michael@0 5876 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
michael@0 5877 __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
michael@0 5878
michael@0 5879 m = *(uint32_t*)mask;
michael@0 5880
michael@0 5881 if (m)
michael@0 5882 {
michael@0 5883 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
michael@0 5884 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
michael@0 5885 BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
michael@0 5886 BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
michael@0 5887
michael@0 5888 xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
michael@0 5889
michael@0 5890 if (m == 0xffffffff && is_opaque (xmm_src))
michael@0 5891 {
michael@0 5892 save_128_aligned ((__m128i *)dst, xmm_src);
michael@0 5893 }
michael@0 5894 else
michael@0 5895 {
michael@0 5896 xmm_dst = load_128_aligned ((__m128i *)dst);
michael@0 5897
michael@0 5898 xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
michael@0 5899
michael@0 5900 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
michael@0 5901 unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
michael@0 5902 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
michael@0 5903
michael@0 5904 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
michael@0 5905 expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
michael@0 5906
michael@0 5907 in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
michael@0 5908 &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
michael@0 5909
michael@0 5910 save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 5911 }
michael@0 5912 }
michael@0 5913 else
michael@0 5914 {
michael@0 5915 BILINEAR_SKIP_ONE_PIXEL ();
michael@0 5916 BILINEAR_SKIP_ONE_PIXEL ();
michael@0 5917 BILINEAR_SKIP_ONE_PIXEL ();
michael@0 5918 BILINEAR_SKIP_ONE_PIXEL ();
michael@0 5919 }
michael@0 5920
michael@0 5921 w -= 4;
michael@0 5922 dst += 4;
michael@0 5923 mask += 4;
michael@0 5924 }
michael@0 5925
michael@0 5926 while (w)
michael@0 5927 {
michael@0 5928 uint32_t sa;
michael@0 5929
michael@0 5930 m = (uint32_t) *mask++;
michael@0 5931
michael@0 5932 if (m)
michael@0 5933 {
michael@0 5934 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
michael@0 5935 sa = pix1 >> 24;
michael@0 5936
michael@0 5937 if (sa == 0xff && m == 0xff)
michael@0 5938 {
michael@0 5939 *dst = pix1;
michael@0 5940 }
michael@0 5941 else
michael@0 5942 {
michael@0 5943 __m128i ms, md, ma, msa;
michael@0 5944
michael@0 5945 pix2 = *dst;
michael@0 5946 ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
michael@0 5947 ms = unpack_32_1x128 (pix1);
michael@0 5948 md = unpack_32_1x128 (pix2);
michael@0 5949
michael@0 5950 msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
michael@0 5951
michael@0 5952 *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
michael@0 5953 }
michael@0 5954 }
michael@0 5955 else
michael@0 5956 {
michael@0 5957 BILINEAR_SKIP_ONE_PIXEL ();
michael@0 5958 }
michael@0 5959
michael@0 5960 w--;
michael@0 5961 dst++;
michael@0 5962 }
michael@0 5963 }
michael@0 5964
michael@0 5965 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
michael@0 5966 scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
michael@0 5967 uint32_t, uint8_t, uint32_t,
michael@0 5968 COVER, FLAG_HAVE_NON_SOLID_MASK)
michael@0 5969 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
michael@0 5970 scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
michael@0 5971 uint32_t, uint8_t, uint32_t,
michael@0 5972 PAD, FLAG_HAVE_NON_SOLID_MASK)
michael@0 5973 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
michael@0 5974 scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
michael@0 5975 uint32_t, uint8_t, uint32_t,
michael@0 5976 NONE, FLAG_HAVE_NON_SOLID_MASK)
michael@0 5977 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
michael@0 5978 scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL,
michael@0 5979 uint32_t, uint8_t, uint32_t,
michael@0 5980 NORMAL, FLAG_HAVE_NON_SOLID_MASK)
michael@0 5981
michael@0 5982 static force_inline void
michael@0 5983 scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst,
michael@0 5984 const uint32_t * mask,
michael@0 5985 const uint32_t * src_top,
michael@0 5986 const uint32_t * src_bottom,
michael@0 5987 int32_t w,
michael@0 5988 int wt,
michael@0 5989 int wb,
michael@0 5990 pixman_fixed_t vx,
michael@0 5991 pixman_fixed_t unit_x,
michael@0 5992 pixman_fixed_t max_vx,
michael@0 5993 pixman_bool_t zero_src)
michael@0 5994 {
michael@0 5995 BILINEAR_DECLARE_VARIABLES;
michael@0 5996 uint32_t pix1, pix2, pix3, pix4;
michael@0 5997 __m128i xmm_mask;
michael@0 5998
michael@0 5999 if (zero_src || (*mask >> 24) == 0)
michael@0 6000 return;
michael@0 6001
michael@0 6002 xmm_mask = create_mask_16_128 (*mask >> 24);
michael@0 6003
michael@0 6004 while (w && ((uintptr_t)dst & 15))
michael@0 6005 {
michael@0 6006 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
michael@0 6007 if (pix1)
michael@0 6008 {
michael@0 6009 uint32_t d = *dst;
michael@0 6010
michael@0 6011 __m128i ms = unpack_32_1x128 (pix1);
michael@0 6012 __m128i alpha = expand_alpha_1x128 (ms);
michael@0 6013 __m128i dest = xmm_mask;
michael@0 6014 __m128i alpha_dst = unpack_32_1x128 (d);
michael@0 6015
michael@0 6016 *dst = pack_1x128_32
michael@0 6017 (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
michael@0 6018 }
michael@0 6019
michael@0 6020 dst++;
michael@0 6021 w--;
michael@0 6022 }
michael@0 6023
michael@0 6024 while (w >= 4)
michael@0 6025 {
michael@0 6026 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
michael@0 6027 BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
michael@0 6028 BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
michael@0 6029 BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
michael@0 6030
michael@0 6031 if (pix1 | pix2 | pix3 | pix4)
michael@0 6032 {
michael@0 6033 __m128i xmm_src, xmm_src_lo, xmm_src_hi;
michael@0 6034 __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
michael@0 6035 __m128i xmm_alpha_lo, xmm_alpha_hi;
michael@0 6036
michael@0 6037 xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
michael@0 6038
michael@0 6039 xmm_dst = load_128_aligned ((__m128i*)dst);
michael@0 6040
michael@0 6041 unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
michael@0 6042 unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
michael@0 6043 expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
michael@0 6044 &xmm_alpha_lo, &xmm_alpha_hi);
michael@0 6045
michael@0 6046 in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
michael@0 6047 &xmm_alpha_lo, &xmm_alpha_hi,
michael@0 6048 &xmm_mask, &xmm_mask,
michael@0 6049 &xmm_dst_lo, &xmm_dst_hi);
michael@0 6050
michael@0 6051 save_128_aligned
michael@0 6052 ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
michael@0 6053 }
michael@0 6054
michael@0 6055 dst += 4;
michael@0 6056 w -= 4;
michael@0 6057 }
michael@0 6058
michael@0 6059 while (w)
michael@0 6060 {
michael@0 6061 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
michael@0 6062 if (pix1)
michael@0 6063 {
michael@0 6064 uint32_t d = *dst;
michael@0 6065
michael@0 6066 __m128i ms = unpack_32_1x128 (pix1);
michael@0 6067 __m128i alpha = expand_alpha_1x128 (ms);
michael@0 6068 __m128i dest = xmm_mask;
michael@0 6069 __m128i alpha_dst = unpack_32_1x128 (d);
michael@0 6070
michael@0 6071 *dst = pack_1x128_32
michael@0 6072 (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
michael@0 6073 }
michael@0 6074
michael@0 6075 dst++;
michael@0 6076 w--;
michael@0 6077 }
michael@0 6078 }
michael@0 6079
michael@0 6080 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
michael@0 6081 scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
michael@0 6082 uint32_t, uint32_t, uint32_t,
michael@0 6083 COVER, FLAG_HAVE_SOLID_MASK)
michael@0 6084 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
michael@0 6085 scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
michael@0 6086 uint32_t, uint32_t, uint32_t,
michael@0 6087 PAD, FLAG_HAVE_SOLID_MASK)
michael@0 6088 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
michael@0 6089 scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
michael@0 6090 uint32_t, uint32_t, uint32_t,
michael@0 6091 NONE, FLAG_HAVE_SOLID_MASK)
michael@0 6092 FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
michael@0 6093 scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL,
michael@0 6094 uint32_t, uint32_t, uint32_t,
michael@0 6095 NORMAL, FLAG_HAVE_SOLID_MASK)
michael@0 6096
michael@0 6097 static const pixman_fast_path_t sse2_fast_paths[] =
michael@0 6098 {
michael@0 6099 /* PIXMAN_OP_OVER */
michael@0 6100 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
michael@0 6101 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
michael@0 6102 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
michael@0 6103 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
michael@0 6104 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
michael@0 6105 PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
michael@0 6106 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
michael@0 6107 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
michael@0 6108 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
michael@0 6109 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
michael@0 6110 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
michael@0 6111 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
michael@0 6112 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
michael@0 6113 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
michael@0 6114 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
michael@0 6115 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
michael@0 6116 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
michael@0 6117 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
michael@0 6118 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
michael@0 6119 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
michael@0 6120 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
michael@0 6121 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
michael@0 6122 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
michael@0 6123 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
michael@0 6124 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
michael@0 6125 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
michael@0 6126 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
michael@0 6127 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
michael@0 6128 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
michael@0 6129 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
michael@0 6130 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
michael@0 6131 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
michael@0 6132 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
michael@0 6133 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
michael@0 6134 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
michael@0 6135 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
michael@0 6136 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
michael@0 6137 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
michael@0 6138 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
michael@0 6139 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
michael@0 6140 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
michael@0 6141 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
michael@0 6142 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
michael@0 6143 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
michael@0 6144 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
michael@0 6145 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
michael@0 6146 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
michael@0 6147
michael@0 6148 /* PIXMAN_OP_OVER_REVERSE */
michael@0 6149 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
michael@0 6150 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
michael@0 6151
michael@0 6152 /* PIXMAN_OP_ADD */
michael@0 6153 PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
michael@0 6154 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
michael@0 6155 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
michael@0 6156 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
michael@0 6157 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
michael@0 6158 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
michael@0 6159 PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
michael@0 6160 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
michael@0 6161 PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
michael@0 6162 PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
michael@0 6163 PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
michael@0 6164 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
michael@0 6165 PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
michael@0 6166 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
michael@0 6167
michael@0 6168 /* PIXMAN_OP_SRC */
michael@0 6169 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
michael@0 6170 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
michael@0 6171 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
michael@0 6172 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
michael@0 6173 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
michael@0 6174 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
michael@0 6175 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
michael@0 6176 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
michael@0 6177 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
michael@0 6178 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
michael@0 6179 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
michael@0 6180 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
michael@0 6181 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
michael@0 6182 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
michael@0 6183 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
michael@0 6184 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
michael@0 6185 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
michael@0 6186 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
michael@0 6187
michael@0 6188 /* PIXMAN_OP_IN */
michael@0 6189 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
michael@0 6190 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
michael@0 6191 PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
michael@0 6192
michael@0 6193 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
michael@0 6194 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
michael@0 6195 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
michael@0 6196 SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
michael@0 6197 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
michael@0 6198 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
michael@0 6199 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
michael@0 6200 SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
michael@0 6201 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
michael@0 6202 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
michael@0 6203 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
michael@0 6204 SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
michael@0 6205 SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
michael@0 6206 SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
michael@0 6207 SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
michael@0 6208 SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
michael@0 6209
michael@0 6210 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
michael@0 6211 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
michael@0 6212 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
michael@0 6213 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
michael@0 6214 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
michael@0 6215 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
michael@0 6216 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
michael@0 6217 SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
michael@0 6218
michael@0 6219 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
michael@0 6220 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
michael@0 6221 SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
michael@0 6222 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
michael@0 6223 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
michael@0 6224 SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
michael@0 6225
michael@0 6226 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
michael@0 6227 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
michael@0 6228 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
michael@0 6229 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
michael@0 6230
michael@0 6231 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
michael@0 6232 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
michael@0 6233 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
michael@0 6234 SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
michael@0 6235
michael@0 6236 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
michael@0 6237 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
michael@0 6238 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
michael@0 6239 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
michael@0 6240
michael@0 6241 /* Fast path table entries for the two-stage bilinear over_8888_0565 path above */
michael@0 6242
michael@0 6243 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, sse2_8888_0565),
michael@0 6244 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, b5g6r5, sse2_8888_0565),
michael@0 6245
michael@0 6246 { PIXMAN_OP_NONE },
michael@0 6247 };
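
/*
 * Fast paths are matched in table order (first match wins), and the
 * PIXMAN_OP_NONE entry above acts as the terminating sentinel.
 */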
michael@0 6248
michael@0 6249 static uint32_t *
michael@0 6250 sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
michael@0 6251 {
michael@0 6252 int w = iter->width;
michael@0 6253 __m128i ff000000 = mask_ff000000;
michael@0 6254 uint32_t *dst = iter->buffer;
michael@0 6255 uint32_t *src = (uint32_t *)iter->bits;
michael@0 6256
michael@0 6257 iter->bits += iter->stride;
michael@0 6258
michael@0 6259 while (w && ((uintptr_t)dst) & 0x0f)
michael@0 6260 {
michael@0 6261 *dst++ = (*src++) | 0xff000000;
michael@0 6262 w--;
michael@0 6263 }
michael@0 6264
michael@0 6265 while (w >= 4)
michael@0 6266 {
michael@0 6267 save_128_aligned (
michael@0 6268 (__m128i *)dst, _mm_or_si128 (
michael@0 6269 load_128_unaligned ((__m128i *)src), ff000000));
michael@0 6270
michael@0 6271 dst += 4;
michael@0 6272 src += 4;
michael@0 6273 w -= 4;
michael@0 6274 }
michael@0 6275
michael@0 6276 while (w)
michael@0 6277 {
michael@0 6278 *dst++ = (*src++) | 0xff000000;
michael@0 6279 w--;
michael@0 6280 }
michael@0 6281
michael@0 6282 return iter->buffer;
michael@0 6283 }
michael@0 6284
michael@0 6285 static uint32_t *
michael@0 6286 sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
michael@0 6287 {
michael@0 6288 int w = iter->width;
michael@0 6289 uint32_t *dst = iter->buffer;
michael@0 6290 uint16_t *src = (uint16_t *)iter->bits;
michael@0 6291 __m128i ff000000 = mask_ff000000;
michael@0 6292
michael@0 6293 iter->bits += iter->stride;
michael@0 6294
michael@0 6295 while (w && ((uintptr_t)dst) & 0x0f)
michael@0 6296 {
michael@0 6297 uint16_t s = *src++;
michael@0 6298
michael@0 6299 *dst++ = convert_0565_to_8888 (s);
michael@0 6300 w--;
michael@0 6301 }
michael@0 6302
michael@0 6303 while (w >= 8)
michael@0 6304 {
michael@0 6305 __m128i lo, hi, s;
michael@0 6306
michael@0 6307 s = _mm_loadu_si128 ((__m128i *)src);
michael@0 6308
michael@0 6309 lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
michael@0 6310 hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
michael@0 6311
michael@0 6312 save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
michael@0 6313 save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
michael@0 6314
michael@0 6315 dst += 8;
michael@0 6316 src += 8;
michael@0 6317 w -= 8;
michael@0 6318 }
michael@0 6319
michael@0 6320 while (w)
michael@0 6321 {
michael@0 6322 uint16_t s = *src++;
michael@0 6323
michael@0 6324 *dst++ = convert_0565_to_8888 (s);
michael@0 6325 w--;
michael@0 6326 }
michael@0 6327
michael@0 6328 return iter->buffer;
michael@0 6329 }
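
/*
 * Scalar model of the 0565 -> 8888 expansion done above (an
 * illustrative sketch; the real convert_0565_to_8888 and
 * unpack_565_to_8888 helpers use the same bit replication, so that
 * e.g. 0x1f widens to 0xff rather than 0xf8):
 */
static force_inline uint32_t
expand_0565_sketch (uint16_t s)
{
    uint32_t r = (s >> 11) & 0x1f;
    uint32_t g = (s >> 5)  & 0x3f;
    uint32_t b =  s        & 0x1f;

    r = (r << 3) | (r >> 2);   /* replicate 5 -> 8 bits */
    g = (g << 2) | (g >> 4);   /* replicate 6 -> 8 bits */
    b = (b << 3) | (b >> 2);

    return 0xff000000 | (r << 16) | (g << 8) | b;
}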
michael@0 6330
michael@0 6331 static uint32_t *
michael@0 6332 sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
michael@0 6333 {
michael@0 6334 int w = iter->width;
michael@0 6335 uint32_t *dst = iter->buffer;
michael@0 6336 uint8_t *src = iter->bits;
michael@0 6337 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
michael@0 6338
michael@0 6339 iter->bits += iter->stride;
michael@0 6340
michael@0 6341 while (w && (((uintptr_t)dst) & 15))
michael@0 6342 {
michael@0 6343 *dst++ = *(src++) << 24;
michael@0 6344 w--;
michael@0 6345 }
michael@0 6346
michael@0 6347 while (w >= 16)
michael@0 6348 {
michael@0 6349 xmm0 = _mm_loadu_si128((__m128i *)src);
michael@0 6350
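/* Unpacking with zero as the low byte moves each a8 value into the
 * most significant byte of its lane: 8-bit a becomes 16-bit a << 8,
 * then 32-bit a << 24, matching the scalar '*src << 24' loops. */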
michael@0 6351 xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0);
michael@0 6352 xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0);
michael@0 6353 xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
michael@0 6354 xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
michael@0 6355 xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
michael@0 6356 xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
michael@0 6357
michael@0 6358 _mm_store_si128(((__m128i *)(dst + 0)), xmm3);
michael@0 6359 _mm_store_si128(((__m128i *)(dst + 4)), xmm4);
michael@0 6360 _mm_store_si128(((__m128i *)(dst + 8)), xmm5);
michael@0 6361 _mm_store_si128(((__m128i *)(dst + 12)), xmm6);
michael@0 6362
michael@0 6363 dst += 16;
michael@0 6364 src += 16;
michael@0 6365 w -= 16;
michael@0 6366 }
michael@0 6367
michael@0 6368 while (w)
michael@0 6369 {
michael@0 6370 *dst++ = *(src++) << 24;
michael@0 6371 w--;
michael@0 6372 }
michael@0 6373
michael@0 6374 return iter->buffer;
michael@0 6375 }
michael@0 6376
michael@0 6377 typedef struct
michael@0 6378 {
michael@0 6379 pixman_format_code_t format;
michael@0 6380 pixman_iter_get_scanline_t get_scanline;
michael@0 6381 } fetcher_info_t;
michael@0 6382
michael@0 6383 static const fetcher_info_t fetchers[] =
michael@0 6384 {
michael@0 6385 { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 },
michael@0 6386 { PIXMAN_r5g6b5, sse2_fetch_r5g6b5 },
michael@0 6387 { PIXMAN_a8, sse2_fetch_a8 },
michael@0 6388 { PIXMAN_null }
michael@0 6389 };

static pixman_bool_t
sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
{
    pixman_image_t *image = iter->image;

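/* The fetchers above only handle plain, untransformed bits images whose
 * samples fully cover the clip, producing narrow (8 bpc) scanlines; for
 * anything else we return FALSE so the caller falls back to the general
 * path. */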
#define FLAGS \
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

    if ((iter->iter_flags & ITER_NARROW) &&
        (iter->image_flags & FLAGS) == FLAGS)
    {
        const fetcher_info_t *f;

        for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
        {
            if (image->common.extended_format_code == f->format)
            {
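                /* rowstride is in uint32_t units; multiply by 4 for a
                 * byte stride, then point iter->bits at pixel (x, y). */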
                uint8_t *b = (uint8_t *)image->bits.bits;
                int s = image->bits.rowstride * 4;

                iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
                iter->stride = s;

                iter->get_scanline = f->get_scanline;
                return TRUE;
            }
        }
    }

    return FALSE;
}

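/* On 32-bit x86 the incoming stack is not guaranteed to be 16-byte
 * aligned, so ask GCC to re-align it on entry; any SSE register spills
 * in this function require 16-byte alignment. */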
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
pixman_implementation_t *
_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);

    /* SSE2 constants */
    mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
    mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
    mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
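    /* mask_565_rb and mask_565_pack_multiplier feed the
     * _mm_madd_epi16-based x8r8g8b8 -> r5g6b5 packing path defined
     * earlier in this file. */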

    /* Set up function pointers */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;

    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;

    imp->blt = sse2_blt;
    imp->fill = sse2_fill;

    imp->src_iter_init = sse2_src_iter_init;

    return imp;
}
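
/* A minimal usage sketch (hypothetical; the real call site is the CPU
 * feature dispatch in pixman-x86.c and may differ between versions):
 *
 *     if (have_sse2)
 *         imp = _pixman_implementation_create_sse2 (imp);
 *
 * so the SSE2 fast paths are consulted first, and any operation they
 * do not handle falls back to the wrapped implementation. */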
