michael@0: /* michael@0: * Copyright © 2008 Rodrigo Kumpera michael@0: * Copyright © 2008 André Tupinambá michael@0: * michael@0: * Permission to use, copy, modify, distribute, and sell this software and its michael@0: * documentation for any purpose is hereby granted without fee, provided that michael@0: * the above copyright notice appear in all copies and that both that michael@0: * copyright notice and this permission notice appear in supporting michael@0: * documentation, and that the name of Red Hat not be used in advertising or michael@0: * publicity pertaining to distribution of the software without specific, michael@0: * written prior permission. Red Hat makes no representations about the michael@0: * suitability of this software for any purpose. It is provided "as is" michael@0: * without express or implied warranty. michael@0: * michael@0: * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS michael@0: * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND michael@0: * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY michael@0: * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES michael@0: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN michael@0: * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING michael@0: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS michael@0: * SOFTWARE. michael@0: * michael@0: * Author: Rodrigo Kumpera (kumpera@gmail.com) michael@0: * André Tupinambá (andrelrt@gmail.com) michael@0: * michael@0: * Based on work by Owen Taylor and Søren Sandmann michael@0: */ michael@0: #ifdef HAVE_CONFIG_H michael@0: #include michael@0: #endif michael@0: michael@0: #include /* for _mm_shuffle_pi16 and _MM_SHUFFLE */ michael@0: #include /* for SSE2 intrinsics */ michael@0: #include "pixman-private.h" michael@0: #include "pixman-combine32.h" michael@0: #include "pixman-inlines.h" michael@0: michael@0: static __m128i mask_0080; michael@0: static __m128i mask_00ff; michael@0: static __m128i mask_0101; michael@0: static __m128i mask_ffff; michael@0: static __m128i mask_ff000000; michael@0: static __m128i mask_alpha; michael@0: michael@0: static __m128i mask_565_r; michael@0: static __m128i mask_565_g1, mask_565_g2; michael@0: static __m128i mask_565_b; michael@0: static __m128i mask_red; michael@0: static __m128i mask_green; michael@0: static __m128i mask_blue; michael@0: michael@0: static __m128i mask_565_fix_rb; michael@0: static __m128i mask_565_fix_g; michael@0: michael@0: static __m128i mask_565_rb; michael@0: static __m128i mask_565_pack_multiplier; michael@0: michael@0: static force_inline __m128i michael@0: unpack_32_1x128 (uint32_t data) michael@0: { michael@0: return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ()); michael@0: } michael@0: michael@0: static force_inline void michael@0: unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi) michael@0: { michael@0: *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ()); michael@0: *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ()); michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: unpack_565_to_8888 (__m128i lo) michael@0: { michael@0: __m128i r, g, b, rb, t; michael@0: michael@0: r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red); michael@0: g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green); michael@0: b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue); michael@0: michael@0: rb = _mm_or_si128 (r, b); michael@0: t = _mm_and_si128 (rb, mask_565_fix_rb); michael@0: t = _mm_srli_epi32 (t, 5); michael@0: rb = _mm_or_si128 (rb, t); michael@0: michael@0: t = _mm_and_si128 (g, mask_565_fix_g); michael@0: t = _mm_srli_epi32 (t, 6); michael@0: g = _mm_or_si128 (g, t); michael@0: michael@0: return _mm_or_si128 (rb, g); michael@0: } michael@0: michael@0: static force_inline void michael@0: unpack_565_128_4x128 (__m128i data, michael@0: __m128i* data0, michael@0: __m128i* data1, michael@0: __m128i* data2, michael@0: __m128i* data3) michael@0: { michael@0: __m128i lo, hi; michael@0: michael@0: lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ()); michael@0: hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ()); michael@0: michael@0: lo = unpack_565_to_8888 (lo); michael@0: hi = unpack_565_to_8888 (hi); michael@0: michael@0: unpack_128_2x128 (lo, data0, data1); michael@0: unpack_128_2x128 (hi, data2, data3); michael@0: } michael@0: michael@0: static force_inline uint16_t michael@0: pack_565_32_16 (uint32_t pixel) michael@0: { michael@0: return (uint16_t) (((pixel >> 8) & 0xf800) | michael@0: ((pixel >> 5) & 0x07e0) | michael@0: ((pixel >> 3) & 0x001f)); michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: pack_2x128_128 (__m128i lo, __m128i hi) michael@0: { michael@0: return _mm_packus_epi16 (lo, hi); michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: pack_565_2packedx128_128 (__m128i lo, __m128i hi) michael@0: { michael@0: __m128i rb0 = _mm_and_si128 (lo, mask_565_rb); michael@0: __m128i rb1 = _mm_and_si128 (hi, mask_565_rb); michael@0: michael@0: __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier); michael@0: __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier); michael@0: michael@0: __m128i g0 = _mm_and_si128 (lo, mask_green); michael@0: __m128i g1 = _mm_and_si128 (hi, mask_green); michael@0: michael@0: t0 = _mm_or_si128 (t0, g0); michael@0: t1 = _mm_or_si128 (t1, g1); michael@0: michael@0: /* Simulates _mm_packus_epi32 */ michael@0: t0 = _mm_slli_epi32 (t0, 16 - 5); michael@0: t1 = _mm_slli_epi32 (t1, 16 - 5); michael@0: t0 = _mm_srai_epi32 (t0, 16); michael@0: t1 = _mm_srai_epi32 (t1, 16); michael@0: return _mm_packs_epi32 (t0, t1); michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: pack_565_2x128_128 (__m128i lo, __m128i hi) michael@0: { michael@0: __m128i data; michael@0: __m128i r, g1, g2, b; michael@0: michael@0: data = pack_2x128_128 (lo, hi); michael@0: michael@0: r = _mm_and_si128 (data, mask_565_r); michael@0: g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1); michael@0: g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2); michael@0: b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b); michael@0: michael@0: return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b); michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3) michael@0: { michael@0: return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1), michael@0: pack_565_2x128_128 (*xmm2, *xmm3)); michael@0: } michael@0: michael@0: static force_inline int michael@0: is_opaque (__m128i x) michael@0: { michael@0: __m128i ffs = _mm_cmpeq_epi8 (x, x); michael@0: michael@0: return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888; michael@0: } michael@0: michael@0: static force_inline int michael@0: is_zero (__m128i x) michael@0: { michael@0: return _mm_movemask_epi8 ( michael@0: _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff; michael@0: } michael@0: michael@0: static force_inline int michael@0: is_transparent (__m128i x) michael@0: { michael@0: return (_mm_movemask_epi8 ( michael@0: _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888; michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: expand_pixel_32_1x128 (uint32_t data) michael@0: { michael@0: return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0)); michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: expand_alpha_1x128 (__m128i data) michael@0: { michael@0: return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data, michael@0: _MM_SHUFFLE (3, 3, 3, 3)), michael@0: _MM_SHUFFLE (3, 3, 3, 3)); michael@0: } michael@0: michael@0: static force_inline void michael@0: expand_alpha_2x128 (__m128i data_lo, michael@0: __m128i data_hi, michael@0: __m128i* alpha_lo, michael@0: __m128i* alpha_hi) michael@0: { michael@0: __m128i lo, hi; michael@0: michael@0: lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3)); michael@0: hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3)); michael@0: michael@0: *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3)); michael@0: *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3)); michael@0: } michael@0: michael@0: static force_inline void michael@0: expand_alpha_rev_2x128 (__m128i data_lo, michael@0: __m128i data_hi, michael@0: __m128i* alpha_lo, michael@0: __m128i* alpha_hi) michael@0: { michael@0: __m128i lo, hi; michael@0: michael@0: lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0)); michael@0: hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0)); michael@0: *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0)); michael@0: *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0)); michael@0: } michael@0: michael@0: static force_inline void michael@0: pix_multiply_2x128 (__m128i* data_lo, michael@0: __m128i* data_hi, michael@0: __m128i* alpha_lo, michael@0: __m128i* alpha_hi, michael@0: __m128i* ret_lo, michael@0: __m128i* ret_hi) michael@0: { michael@0: __m128i lo, hi; michael@0: michael@0: lo = _mm_mullo_epi16 (*data_lo, *alpha_lo); michael@0: hi = _mm_mullo_epi16 (*data_hi, *alpha_hi); michael@0: lo = _mm_adds_epu16 (lo, mask_0080); michael@0: hi = _mm_adds_epu16 (hi, mask_0080); michael@0: *ret_lo = _mm_mulhi_epu16 (lo, mask_0101); michael@0: *ret_hi = _mm_mulhi_epu16 (hi, mask_0101); michael@0: } michael@0: michael@0: static force_inline void michael@0: pix_add_multiply_2x128 (__m128i* src_lo, michael@0: __m128i* src_hi, michael@0: __m128i* alpha_dst_lo, michael@0: __m128i* alpha_dst_hi, michael@0: __m128i* dst_lo, michael@0: __m128i* dst_hi, michael@0: __m128i* alpha_src_lo, michael@0: __m128i* alpha_src_hi, michael@0: __m128i* ret_lo, michael@0: __m128i* ret_hi) michael@0: { michael@0: __m128i t1_lo, t1_hi; michael@0: __m128i t2_lo, t2_hi; michael@0: michael@0: pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi); michael@0: pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi); michael@0: michael@0: *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo); michael@0: *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi); michael@0: } michael@0: michael@0: static force_inline void michael@0: negate_2x128 (__m128i data_lo, michael@0: __m128i data_hi, michael@0: __m128i* neg_lo, michael@0: __m128i* neg_hi) michael@0: { michael@0: *neg_lo = _mm_xor_si128 (data_lo, mask_00ff); michael@0: *neg_hi = _mm_xor_si128 (data_hi, mask_00ff); michael@0: } michael@0: michael@0: static force_inline void michael@0: invert_colors_2x128 (__m128i data_lo, michael@0: __m128i data_hi, michael@0: __m128i* inv_lo, michael@0: __m128i* inv_hi) michael@0: { michael@0: __m128i lo, hi; michael@0: michael@0: lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2)); michael@0: hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2)); michael@0: *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2)); michael@0: *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2)); michael@0: } michael@0: michael@0: static force_inline void michael@0: over_2x128 (__m128i* src_lo, michael@0: __m128i* src_hi, michael@0: __m128i* alpha_lo, michael@0: __m128i* alpha_hi, michael@0: __m128i* dst_lo, michael@0: __m128i* dst_hi) michael@0: { michael@0: __m128i t1, t2; michael@0: michael@0: negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2); michael@0: michael@0: pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi); michael@0: michael@0: *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo); michael@0: *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi); michael@0: } michael@0: michael@0: static force_inline void michael@0: over_rev_non_pre_2x128 (__m128i src_lo, michael@0: __m128i src_hi, michael@0: __m128i* dst_lo, michael@0: __m128i* dst_hi) michael@0: { michael@0: __m128i lo, hi; michael@0: __m128i alpha_lo, alpha_hi; michael@0: michael@0: expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi); michael@0: michael@0: lo = _mm_or_si128 (alpha_lo, mask_alpha); michael@0: hi = _mm_or_si128 (alpha_hi, mask_alpha); michael@0: michael@0: invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi); michael@0: michael@0: pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi); michael@0: michael@0: over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi); michael@0: } michael@0: michael@0: static force_inline void michael@0: in_over_2x128 (__m128i* src_lo, michael@0: __m128i* src_hi, michael@0: __m128i* alpha_lo, michael@0: __m128i* alpha_hi, michael@0: __m128i* mask_lo, michael@0: __m128i* mask_hi, michael@0: __m128i* dst_lo, michael@0: __m128i* dst_hi) michael@0: { michael@0: __m128i s_lo, s_hi; michael@0: __m128i a_lo, a_hi; michael@0: michael@0: pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi); michael@0: pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi); michael@0: michael@0: over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi); michael@0: } michael@0: michael@0: /* load 4 pixels from a 16-byte boundary aligned address */ michael@0: static force_inline __m128i michael@0: load_128_aligned (__m128i* src) michael@0: { michael@0: return _mm_load_si128 (src); michael@0: } michael@0: michael@0: /* load 4 pixels from a unaligned address */ michael@0: static force_inline __m128i michael@0: load_128_unaligned (const __m128i* src) michael@0: { michael@0: return _mm_loadu_si128 (src); michael@0: } michael@0: michael@0: /* save 4 pixels using Write Combining memory on a 16-byte michael@0: * boundary aligned address michael@0: */ michael@0: static force_inline void michael@0: save_128_write_combining (__m128i* dst, michael@0: __m128i data) michael@0: { michael@0: _mm_stream_si128 (dst, data); michael@0: } michael@0: michael@0: /* save 4 pixels on a 16-byte boundary aligned address */ michael@0: static force_inline void michael@0: save_128_aligned (__m128i* dst, michael@0: __m128i data) michael@0: { michael@0: _mm_store_si128 (dst, data); michael@0: } michael@0: michael@0: /* save 4 pixels on a unaligned address */ michael@0: static force_inline void michael@0: save_128_unaligned (__m128i* dst, michael@0: __m128i data) michael@0: { michael@0: _mm_storeu_si128 (dst, data); michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: load_32_1x128 (uint32_t data) michael@0: { michael@0: return _mm_cvtsi32_si128 (data); michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: expand_alpha_rev_1x128 (__m128i data) michael@0: { michael@0: return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0)); michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: expand_pixel_8_1x128 (uint8_t data) michael@0: { michael@0: return _mm_shufflelo_epi16 ( michael@0: unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0)); michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: pix_multiply_1x128 (__m128i data, michael@0: __m128i alpha) michael@0: { michael@0: return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha), michael@0: mask_0080), michael@0: mask_0101); michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: pix_add_multiply_1x128 (__m128i* src, michael@0: __m128i* alpha_dst, michael@0: __m128i* dst, michael@0: __m128i* alpha_src) michael@0: { michael@0: __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst); michael@0: __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src); michael@0: michael@0: return _mm_adds_epu8 (t1, t2); michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: negate_1x128 (__m128i data) michael@0: { michael@0: return _mm_xor_si128 (data, mask_00ff); michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: invert_colors_1x128 (__m128i data) michael@0: { michael@0: return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2)); michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: over_1x128 (__m128i src, __m128i alpha, __m128i dst) michael@0: { michael@0: return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha))); michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst) michael@0: { michael@0: return over_1x128 (pix_multiply_1x128 (*src, *mask), michael@0: pix_multiply_1x128 (*alpha, *mask), michael@0: *dst); michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: over_rev_non_pre_1x128 (__m128i src, __m128i dst) michael@0: { michael@0: __m128i alpha = expand_alpha_1x128 (src); michael@0: michael@0: return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src), michael@0: _mm_or_si128 (alpha, mask_alpha)), michael@0: alpha, michael@0: dst); michael@0: } michael@0: michael@0: static force_inline uint32_t michael@0: pack_1x128_32 (__m128i data) michael@0: { michael@0: return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ())); michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: expand565_16_1x128 (uint16_t pixel) michael@0: { michael@0: __m128i m = _mm_cvtsi32_si128 (pixel); michael@0: michael@0: m = unpack_565_to_8888 (m); michael@0: michael@0: return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ()); michael@0: } michael@0: michael@0: static force_inline uint32_t michael@0: core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst) michael@0: { michael@0: uint8_t a; michael@0: __m128i xmms; michael@0: michael@0: a = src >> 24; michael@0: michael@0: if (a == 0xff) michael@0: { michael@0: return src; michael@0: } michael@0: else if (src) michael@0: { michael@0: xmms = unpack_32_1x128 (src); michael@0: return pack_1x128_32 ( michael@0: over_1x128 (xmms, expand_alpha_1x128 (xmms), michael@0: unpack_32_1x128 (dst))); michael@0: } michael@0: michael@0: return dst; michael@0: } michael@0: michael@0: static force_inline uint32_t michael@0: combine1 (const uint32_t *ps, const uint32_t *pm) michael@0: { michael@0: uint32_t s = *ps; michael@0: michael@0: if (pm) michael@0: { michael@0: __m128i ms, mm; michael@0: michael@0: mm = unpack_32_1x128 (*pm); michael@0: mm = expand_alpha_1x128 (mm); michael@0: michael@0: ms = unpack_32_1x128 (s); michael@0: ms = pix_multiply_1x128 (ms, mm); michael@0: michael@0: s = pack_1x128_32 (ms); michael@0: } michael@0: michael@0: return s; michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: combine4 (const __m128i *ps, const __m128i *pm) michael@0: { michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_msk_lo, xmm_msk_hi; michael@0: __m128i s; michael@0: michael@0: if (pm) michael@0: { michael@0: xmm_msk_lo = load_128_unaligned (pm); michael@0: michael@0: if (is_transparent (xmm_msk_lo)) michael@0: return _mm_setzero_si128 (); michael@0: } michael@0: michael@0: s = load_128_unaligned (ps); michael@0: michael@0: if (pm) michael@0: { michael@0: unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_msk_lo, &xmm_msk_hi, michael@0: &xmm_src_lo, &xmm_src_hi); michael@0: michael@0: s = pack_2x128_128 (xmm_src_lo, xmm_src_hi); michael@0: } michael@0: michael@0: return s; michael@0: } michael@0: michael@0: static force_inline void michael@0: core_combine_over_u_sse2_mask (uint32_t * pd, michael@0: const uint32_t* ps, michael@0: const uint32_t* pm, michael@0: int w) michael@0: { michael@0: uint32_t s, d; michael@0: michael@0: /* Align dst on a 16-byte boundary */ michael@0: while (w && ((uintptr_t)pd & 15)) michael@0: { michael@0: d = *pd; michael@0: s = combine1 (ps, pm); michael@0: michael@0: if (s) michael@0: *pd = core_combine_over_u_pixel_sse2 (s, d); michael@0: pd++; michael@0: ps++; michael@0: pm++; michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: __m128i mask = load_128_unaligned ((__m128i *)pm); michael@0: michael@0: if (!is_zero (mask)) michael@0: { michael@0: __m128i src; michael@0: __m128i src_hi, src_lo; michael@0: __m128i mask_hi, mask_lo; michael@0: __m128i alpha_hi, alpha_lo; michael@0: michael@0: src = load_128_unaligned ((__m128i *)ps); michael@0: michael@0: if (is_opaque (_mm_and_si128 (src, mask))) michael@0: { michael@0: save_128_aligned ((__m128i *)pd, src); michael@0: } michael@0: else michael@0: { michael@0: __m128i dst = load_128_aligned ((__m128i *)pd); michael@0: __m128i dst_hi, dst_lo; michael@0: michael@0: unpack_128_2x128 (mask, &mask_lo, &mask_hi); michael@0: unpack_128_2x128 (src, &src_lo, &src_hi); michael@0: michael@0: expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi); michael@0: pix_multiply_2x128 (&src_lo, &src_hi, michael@0: &mask_lo, &mask_hi, michael@0: &src_lo, &src_hi); michael@0: michael@0: unpack_128_2x128 (dst, &dst_lo, &dst_hi); michael@0: michael@0: expand_alpha_2x128 (src_lo, src_hi, michael@0: &alpha_lo, &alpha_hi); michael@0: michael@0: over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, michael@0: &dst_lo, &dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i *)pd, michael@0: pack_2x128_128 (dst_lo, dst_hi)); michael@0: } michael@0: } michael@0: michael@0: pm += 4; michael@0: ps += 4; michael@0: pd += 4; michael@0: w -= 4; michael@0: } michael@0: while (w) michael@0: { michael@0: d = *pd; michael@0: s = combine1 (ps, pm); michael@0: michael@0: if (s) michael@0: *pd = core_combine_over_u_pixel_sse2 (s, d); michael@0: pd++; michael@0: ps++; michael@0: pm++; michael@0: michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: static force_inline void michael@0: core_combine_over_u_sse2_no_mask (uint32_t * pd, michael@0: const uint32_t* ps, michael@0: int w) michael@0: { michael@0: uint32_t s, d; michael@0: michael@0: /* Align dst on a 16-byte boundary */ michael@0: while (w && ((uintptr_t)pd & 15)) michael@0: { michael@0: d = *pd; michael@0: s = *ps; michael@0: michael@0: if (s) michael@0: *pd = core_combine_over_u_pixel_sse2 (s, d); michael@0: pd++; michael@0: ps++; michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: __m128i src; michael@0: __m128i src_hi, src_lo, dst_hi, dst_lo; michael@0: __m128i alpha_hi, alpha_lo; michael@0: michael@0: src = load_128_unaligned ((__m128i *)ps); michael@0: michael@0: if (!is_zero (src)) michael@0: { michael@0: if (is_opaque (src)) michael@0: { michael@0: save_128_aligned ((__m128i *)pd, src); michael@0: } michael@0: else michael@0: { michael@0: __m128i dst = load_128_aligned ((__m128i *)pd); michael@0: michael@0: unpack_128_2x128 (src, &src_lo, &src_hi); michael@0: unpack_128_2x128 (dst, &dst_lo, &dst_hi); michael@0: michael@0: expand_alpha_2x128 (src_lo, src_hi, michael@0: &alpha_lo, &alpha_hi); michael@0: over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi, michael@0: &dst_lo, &dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i *)pd, michael@0: pack_2x128_128 (dst_lo, dst_hi)); michael@0: } michael@0: } michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: w -= 4; michael@0: } michael@0: while (w) michael@0: { michael@0: d = *pd; michael@0: s = *ps; michael@0: michael@0: if (s) michael@0: *pd = core_combine_over_u_pixel_sse2 (s, d); michael@0: pd++; michael@0: ps++; michael@0: michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: static force_inline void michael@0: sse2_combine_over_u (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: if (pm) michael@0: core_combine_over_u_sse2_mask (pd, ps, pm, w); michael@0: else michael@0: core_combine_over_u_sse2_no_mask (pd, ps, w); michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_over_reverse_u (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: uint32_t s, d; michael@0: michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_alpha_lo, xmm_alpha_hi; michael@0: michael@0: /* Align dst on a 16-byte boundary */ michael@0: while (w && michael@0: ((uintptr_t)pd & 15)) michael@0: { michael@0: d = *pd; michael@0: s = combine1 (ps, pm); michael@0: michael@0: *pd++ = core_combine_over_u_pixel_sse2 (d, s); michael@0: w--; michael@0: ps++; michael@0: if (pm) michael@0: pm++; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: /* I'm loading unaligned because I'm not sure michael@0: * about the address alignment. michael@0: */ michael@0: xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); michael@0: xmm_dst_hi = load_128_aligned ((__m128i*) pd); michael@0: michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi); michael@0: michael@0: over_2x128 (&xmm_dst_lo, &xmm_dst_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi, michael@0: &xmm_src_lo, &xmm_src_hi); michael@0: michael@0: /* rebuid the 4 pixel data and save*/ michael@0: save_128_aligned ((__m128i*)pd, michael@0: pack_2x128_128 (xmm_src_lo, xmm_src_hi)); michael@0: michael@0: w -= 4; michael@0: ps += 4; michael@0: pd += 4; michael@0: michael@0: if (pm) michael@0: pm += 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: d = *pd; michael@0: s = combine1 (ps, pm); michael@0: michael@0: *pd++ = core_combine_over_u_pixel_sse2 (d, s); michael@0: ps++; michael@0: w--; michael@0: if (pm) michael@0: pm++; michael@0: } michael@0: } michael@0: michael@0: static force_inline uint32_t michael@0: core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst) michael@0: { michael@0: uint32_t maska = src >> 24; michael@0: michael@0: if (maska == 0) michael@0: { michael@0: return 0; michael@0: } michael@0: else if (maska != 0xff) michael@0: { michael@0: return pack_1x128_32 ( michael@0: pix_multiply_1x128 (unpack_32_1x128 (dst), michael@0: expand_alpha_1x128 (unpack_32_1x128 (src)))); michael@0: } michael@0: michael@0: return dst; michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_in_u (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: uint32_t s, d; michael@0: michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: michael@0: while (w && ((uintptr_t)pd & 15)) michael@0: { michael@0: s = combine1 (ps, pm); michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_in_u_pixel_sse2 (d, s); michael@0: w--; michael@0: ps++; michael@0: if (pm) michael@0: pm++; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_dst_hi = load_128_aligned ((__m128i*) pd); michael@0: xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm); michael@0: michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ((__m128i*)pd, michael@0: pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: w -= 4; michael@0: if (pm) michael@0: pm += 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = combine1 (ps, pm); michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_in_u_pixel_sse2 (d, s); michael@0: w--; michael@0: ps++; michael@0: if (pm) michael@0: pm++; michael@0: } michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_in_reverse_u (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: uint32_t s, d; michael@0: michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: michael@0: while (w && ((uintptr_t)pd & 15)) michael@0: { michael@0: s = combine1 (ps, pm); michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_in_u_pixel_sse2 (s, d); michael@0: ps++; michael@0: w--; michael@0: if (pm) michael@0: pm++; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_dst_hi = load_128_aligned ((__m128i*) pd); michael@0: xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); michael@0: michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, michael@0: &xmm_src_lo, &xmm_src_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: w -= 4; michael@0: if (pm) michael@0: pm += 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = combine1 (ps, pm); michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_in_u_pixel_sse2 (s, d); michael@0: w--; michael@0: ps++; michael@0: if (pm) michael@0: pm++; michael@0: } michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_out_reverse_u (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: while (w && ((uintptr_t)pd & 15)) michael@0: { michael@0: uint32_t s = combine1 (ps, pm); michael@0: uint32_t d = *pd; michael@0: michael@0: *pd++ = pack_1x128_32 ( michael@0: pix_multiply_1x128 ( michael@0: unpack_32_1x128 (d), negate_1x128 ( michael@0: expand_alpha_1x128 (unpack_32_1x128 (s))))); michael@0: michael@0: if (pm) michael@0: pm++; michael@0: ps++; michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: michael@0: xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); michael@0: xmm_dst_hi = load_128_aligned ((__m128i*) pd); michael@0: michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, michael@0: &xmm_src_lo, &xmm_src_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: if (pm) michael@0: pm += 4; michael@0: michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: uint32_t s = combine1 (ps, pm); michael@0: uint32_t d = *pd; michael@0: michael@0: *pd++ = pack_1x128_32 ( michael@0: pix_multiply_1x128 ( michael@0: unpack_32_1x128 (d), negate_1x128 ( michael@0: expand_alpha_1x128 (unpack_32_1x128 (s))))); michael@0: ps++; michael@0: if (pm) michael@0: pm++; michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_out_u (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: while (w && ((uintptr_t)pd & 15)) michael@0: { michael@0: uint32_t s = combine1 (ps, pm); michael@0: uint32_t d = *pd; michael@0: michael@0: *pd++ = pack_1x128_32 ( michael@0: pix_multiply_1x128 ( michael@0: unpack_32_1x128 (s), negate_1x128 ( michael@0: expand_alpha_1x128 (unpack_32_1x128 (d))))); michael@0: w--; michael@0: ps++; michael@0: if (pm) michael@0: pm++; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: michael@0: xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm); michael@0: xmm_dst_hi = load_128_aligned ((__m128i*) pd); michael@0: michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: w -= 4; michael@0: if (pm) michael@0: pm += 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: uint32_t s = combine1 (ps, pm); michael@0: uint32_t d = *pd; michael@0: michael@0: *pd++ = pack_1x128_32 ( michael@0: pix_multiply_1x128 ( michael@0: unpack_32_1x128 (s), negate_1x128 ( michael@0: expand_alpha_1x128 (unpack_32_1x128 (d))))); michael@0: w--; michael@0: ps++; michael@0: if (pm) michael@0: pm++; michael@0: } michael@0: } michael@0: michael@0: static force_inline uint32_t michael@0: core_combine_atop_u_pixel_sse2 (uint32_t src, michael@0: uint32_t dst) michael@0: { michael@0: __m128i s = unpack_32_1x128 (src); michael@0: __m128i d = unpack_32_1x128 (dst); michael@0: michael@0: __m128i sa = negate_1x128 (expand_alpha_1x128 (s)); michael@0: __m128i da = expand_alpha_1x128 (d); michael@0: michael@0: return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_atop_u (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: uint32_t s, d; michael@0: michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; michael@0: __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; michael@0: michael@0: while (w && ((uintptr_t)pd & 15)) michael@0: { michael@0: s = combine1 (ps, pm); michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_atop_u_pixel_sse2 (s, d); michael@0: w--; michael@0: ps++; michael@0: if (pm) michael@0: pm++; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); michael@0: xmm_dst_hi = load_128_aligned ((__m128i*) pd); michael@0: michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_alpha_src_lo, &xmm_alpha_src_hi); michael@0: expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, michael@0: &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); michael@0: michael@0: negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, michael@0: &xmm_alpha_src_lo, &xmm_alpha_src_hi); michael@0: michael@0: pix_add_multiply_2x128 ( michael@0: &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: w -= 4; michael@0: if (pm) michael@0: pm += 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = combine1 (ps, pm); michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_atop_u_pixel_sse2 (s, d); michael@0: w--; michael@0: ps++; michael@0: if (pm) michael@0: pm++; michael@0: } michael@0: } michael@0: michael@0: static force_inline uint32_t michael@0: core_combine_reverse_atop_u_pixel_sse2 (uint32_t src, michael@0: uint32_t dst) michael@0: { michael@0: __m128i s = unpack_32_1x128 (src); michael@0: __m128i d = unpack_32_1x128 (dst); michael@0: michael@0: __m128i sa = expand_alpha_1x128 (s); michael@0: __m128i da = negate_1x128 (expand_alpha_1x128 (d)); michael@0: michael@0: return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa)); michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_atop_reverse_u (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: uint32_t s, d; michael@0: michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; michael@0: __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; michael@0: michael@0: while (w && ((uintptr_t)pd & 15)) michael@0: { michael@0: s = combine1 (ps, pm); michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); michael@0: ps++; michael@0: w--; michael@0: if (pm) michael@0: pm++; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm); michael@0: xmm_dst_hi = load_128_aligned ((__m128i*) pd); michael@0: michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_alpha_src_lo, &xmm_alpha_src_hi); michael@0: expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, michael@0: &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); michael@0: michael@0: negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, michael@0: &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); michael@0: michael@0: pix_add_multiply_2x128 ( michael@0: &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: w -= 4; michael@0: if (pm) michael@0: pm += 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = combine1 (ps, pm); michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d); michael@0: ps++; michael@0: w--; michael@0: if (pm) michael@0: pm++; michael@0: } michael@0: } michael@0: michael@0: static force_inline uint32_t michael@0: core_combine_xor_u_pixel_sse2 (uint32_t src, michael@0: uint32_t dst) michael@0: { michael@0: __m128i s = unpack_32_1x128 (src); michael@0: __m128i d = unpack_32_1x128 (dst); michael@0: michael@0: __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d)); michael@0: __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s)); michael@0: michael@0: return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s)); michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_xor_u (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * dst, michael@0: const uint32_t * src, michael@0: const uint32_t * mask, michael@0: int width) michael@0: { michael@0: int w = width; michael@0: uint32_t s, d; michael@0: uint32_t* pd = dst; michael@0: const uint32_t* ps = src; michael@0: const uint32_t* pm = mask; michael@0: michael@0: __m128i xmm_src, xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; michael@0: __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; michael@0: michael@0: while (w && ((uintptr_t)pd & 15)) michael@0: { michael@0: s = combine1 (ps, pm); michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_xor_u_pixel_sse2 (s, d); michael@0: w--; michael@0: ps++; michael@0: if (pm) michael@0: pm++; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm); michael@0: xmm_dst = load_128_aligned ((__m128i*) pd); michael@0: michael@0: unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_alpha_src_lo, &xmm_alpha_src_hi); michael@0: expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, michael@0: &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); michael@0: michael@0: negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi, michael@0: &xmm_alpha_src_lo, &xmm_alpha_src_hi); michael@0: negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, michael@0: &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); michael@0: michael@0: pix_add_multiply_2x128 ( michael@0: &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: w -= 4; michael@0: if (pm) michael@0: pm += 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = combine1 (ps, pm); michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_xor_u_pixel_sse2 (s, d); michael@0: w--; michael@0: ps++; michael@0: if (pm) michael@0: pm++; michael@0: } michael@0: } michael@0: michael@0: static force_inline void michael@0: sse2_combine_add_u (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * dst, michael@0: const uint32_t * src, michael@0: const uint32_t * mask, michael@0: int width) michael@0: { michael@0: int w = width; michael@0: uint32_t s, d; michael@0: uint32_t* pd = dst; michael@0: const uint32_t* ps = src; michael@0: const uint32_t* pm = mask; michael@0: michael@0: while (w && (uintptr_t)pd & 15) michael@0: { michael@0: s = combine1 (ps, pm); michael@0: d = *pd; michael@0: michael@0: ps++; michael@0: if (pm) michael@0: pm++; michael@0: *pd++ = _mm_cvtsi128_si32 ( michael@0: _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: __m128i s; michael@0: michael@0: s = combine4 ((__m128i*)ps, (__m128i*)pm); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd))); michael@0: michael@0: pd += 4; michael@0: ps += 4; michael@0: if (pm) michael@0: pm += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w--) michael@0: { michael@0: s = combine1 (ps, pm); michael@0: d = *pd; michael@0: michael@0: ps++; michael@0: *pd++ = _mm_cvtsi128_si32 ( michael@0: _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d))); michael@0: if (pm) michael@0: pm++; michael@0: } michael@0: } michael@0: michael@0: static force_inline uint32_t michael@0: core_combine_saturate_u_pixel_sse2 (uint32_t src, michael@0: uint32_t dst) michael@0: { michael@0: __m128i ms = unpack_32_1x128 (src); michael@0: __m128i md = unpack_32_1x128 (dst); michael@0: uint32_t sa = src >> 24; michael@0: uint32_t da = ~dst >> 24; michael@0: michael@0: if (sa > da) michael@0: { michael@0: ms = pix_multiply_1x128 ( michael@0: ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24))); michael@0: } michael@0: michael@0: return pack_1x128_32 (_mm_adds_epu16 (md, ms)); michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_saturate_u (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: uint32_t s, d; michael@0: michael@0: uint32_t pack_cmp; michael@0: __m128i xmm_src, xmm_dst; michael@0: michael@0: while (w && (uintptr_t)pd & 15) michael@0: { michael@0: s = combine1 (ps, pm); michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); michael@0: w--; michael@0: ps++; michael@0: if (pm) michael@0: pm++; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_dst = load_128_aligned ((__m128i*)pd); michael@0: xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm); michael@0: michael@0: pack_cmp = _mm_movemask_epi8 ( michael@0: _mm_cmpgt_epi32 ( michael@0: _mm_srli_epi32 (xmm_src, 24), michael@0: _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24))); michael@0: michael@0: /* if some alpha src is grater than respective ~alpha dst */ michael@0: if (pack_cmp) michael@0: { michael@0: s = combine1 (ps++, pm); michael@0: d = *pd; michael@0: *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); michael@0: if (pm) michael@0: pm++; michael@0: michael@0: s = combine1 (ps++, pm); michael@0: d = *pd; michael@0: *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); michael@0: if (pm) michael@0: pm++; michael@0: michael@0: s = combine1 (ps++, pm); michael@0: d = *pd; michael@0: *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); michael@0: if (pm) michael@0: pm++; michael@0: michael@0: s = combine1 (ps++, pm); michael@0: d = *pd; michael@0: *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); michael@0: if (pm) michael@0: pm++; michael@0: } michael@0: else michael@0: { michael@0: save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src)); michael@0: michael@0: pd += 4; michael@0: ps += 4; michael@0: if (pm) michael@0: pm += 4; michael@0: } michael@0: michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w--) michael@0: { michael@0: s = combine1 (ps, pm); michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); michael@0: ps++; michael@0: if (pm) michael@0: pm++; michael@0: } michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_src_ca (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: uint32_t s, m; michael@0: michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_mask_lo, xmm_mask_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: michael@0: while (w && (uintptr_t)pd & 15) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: *pd++ = pack_1x128_32 ( michael@0: pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_src_hi = load_128_unaligned ((__m128i*)ps); michael@0: xmm_mask_hi = load_128_unaligned ((__m128i*)pm); michael@0: michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: pm += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: *pd++ = pack_1x128_32 ( michael@0: pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: static force_inline uint32_t michael@0: core_combine_over_ca_pixel_sse2 (uint32_t src, michael@0: uint32_t mask, michael@0: uint32_t dst) michael@0: { michael@0: __m128i s = unpack_32_1x128 (src); michael@0: __m128i expAlpha = expand_alpha_1x128 (s); michael@0: __m128i unpk_mask = unpack_32_1x128 (mask); michael@0: __m128i unpk_dst = unpack_32_1x128 (dst); michael@0: michael@0: return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst)); michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_over_ca (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: uint32_t s, m, d; michael@0: michael@0: __m128i xmm_alpha_lo, xmm_alpha_hi; michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_mask_lo, xmm_mask_hi; michael@0: michael@0: while (w && (uintptr_t)pd & 15) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_dst_hi = load_128_aligned ((__m128i*)pd); michael@0: xmm_src_hi = load_128_unaligned ((__m128i*)ps); michael@0: xmm_mask_hi = load_128_unaligned ((__m128i*)pm); michael@0: michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi); michael@0: michael@0: in_over_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: pm += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: static force_inline uint32_t michael@0: core_combine_over_reverse_ca_pixel_sse2 (uint32_t src, michael@0: uint32_t mask, michael@0: uint32_t dst) michael@0: { michael@0: __m128i d = unpack_32_1x128 (dst); michael@0: michael@0: return pack_1x128_32 ( michael@0: over_1x128 (d, expand_alpha_1x128 (d), michael@0: pix_multiply_1x128 (unpack_32_1x128 (src), michael@0: unpack_32_1x128 (mask)))); michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_over_reverse_ca (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: uint32_t s, m, d; michael@0: michael@0: __m128i xmm_alpha_lo, xmm_alpha_hi; michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_mask_lo, xmm_mask_hi; michael@0: michael@0: while (w && (uintptr_t)pd & 15) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_dst_hi = load_128_aligned ((__m128i*)pd); michael@0: xmm_src_hi = load_128_unaligned ((__m128i*)ps); michael@0: xmm_mask_hi = load_128_unaligned ((__m128i*)pm); michael@0: michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi); michael@0: pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: over_2x128 (&xmm_dst_lo, &xmm_dst_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: pm += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_in_ca (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: uint32_t s, m, d; michael@0: michael@0: __m128i xmm_alpha_lo, xmm_alpha_hi; michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_mask_lo, xmm_mask_hi; michael@0: michael@0: while (w && (uintptr_t)pd & 15) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = pack_1x128_32 ( michael@0: pix_multiply_1x128 ( michael@0: pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)), michael@0: expand_alpha_1x128 (unpack_32_1x128 (d)))); michael@0: michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_dst_hi = load_128_aligned ((__m128i*)pd); michael@0: xmm_src_hi = load_128_unaligned ((__m128i*)ps); michael@0: xmm_mask_hi = load_128_unaligned ((__m128i*)pm); michael@0: michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: pm += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = pack_1x128_32 ( michael@0: pix_multiply_1x128 ( michael@0: pix_multiply_1x128 ( michael@0: unpack_32_1x128 (s), unpack_32_1x128 (m)), michael@0: expand_alpha_1x128 (unpack_32_1x128 (d)))); michael@0: michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_in_reverse_ca (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: uint32_t s, m, d; michael@0: michael@0: __m128i xmm_alpha_lo, xmm_alpha_hi; michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_mask_lo, xmm_mask_hi; michael@0: michael@0: while (w && (uintptr_t)pd & 15) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = pack_1x128_32 ( michael@0: pix_multiply_1x128 ( michael@0: unpack_32_1x128 (d), michael@0: pix_multiply_1x128 (unpack_32_1x128 (m), michael@0: expand_alpha_1x128 (unpack_32_1x128 (s))))); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_dst_hi = load_128_aligned ((__m128i*)pd); michael@0: xmm_src_hi = load_128_unaligned ((__m128i*)ps); michael@0: xmm_mask_hi = load_128_unaligned ((__m128i*)pm); michael@0: michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi); michael@0: pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: pm += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = pack_1x128_32 ( michael@0: pix_multiply_1x128 ( michael@0: unpack_32_1x128 (d), michael@0: pix_multiply_1x128 (unpack_32_1x128 (m), michael@0: expand_alpha_1x128 (unpack_32_1x128 (s))))); michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_out_ca (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: uint32_t s, m, d; michael@0: michael@0: __m128i xmm_alpha_lo, xmm_alpha_hi; michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_mask_lo, xmm_mask_hi; michael@0: michael@0: while (w && (uintptr_t)pd & 15) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = pack_1x128_32 ( michael@0: pix_multiply_1x128 ( michael@0: pix_multiply_1x128 ( michael@0: unpack_32_1x128 (s), unpack_32_1x128 (m)), michael@0: negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_dst_hi = load_128_aligned ((__m128i*)pd); michael@0: xmm_src_hi = load_128_unaligned ((__m128i*)ps); michael@0: xmm_mask_hi = load_128_unaligned ((__m128i*)pm); michael@0: michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi); michael@0: negate_2x128 (xmm_alpha_lo, xmm_alpha_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: pm += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = pack_1x128_32 ( michael@0: pix_multiply_1x128 ( michael@0: pix_multiply_1x128 ( michael@0: unpack_32_1x128 (s), unpack_32_1x128 (m)), michael@0: negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); michael@0: michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_out_reverse_ca (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: uint32_t s, m, d; michael@0: michael@0: __m128i xmm_alpha_lo, xmm_alpha_hi; michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_mask_lo, xmm_mask_hi; michael@0: michael@0: while (w && (uintptr_t)pd & 15) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = pack_1x128_32 ( michael@0: pix_multiply_1x128 ( michael@0: unpack_32_1x128 (d), michael@0: negate_1x128 (pix_multiply_1x128 ( michael@0: unpack_32_1x128 (m), michael@0: expand_alpha_1x128 (unpack_32_1x128 (s)))))); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_dst_hi = load_128_aligned ((__m128i*)pd); michael@0: xmm_src_hi = load_128_unaligned ((__m128i*)ps); michael@0: xmm_mask_hi = load_128_unaligned ((__m128i*)pm); michael@0: michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: negate_2x128 (xmm_mask_lo, xmm_mask_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: pm += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = pack_1x128_32 ( michael@0: pix_multiply_1x128 ( michael@0: unpack_32_1x128 (d), michael@0: negate_1x128 (pix_multiply_1x128 ( michael@0: unpack_32_1x128 (m), michael@0: expand_alpha_1x128 (unpack_32_1x128 (s)))))); michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: static force_inline uint32_t michael@0: core_combine_atop_ca_pixel_sse2 (uint32_t src, michael@0: uint32_t mask, michael@0: uint32_t dst) michael@0: { michael@0: __m128i m = unpack_32_1x128 (mask); michael@0: __m128i s = unpack_32_1x128 (src); michael@0: __m128i d = unpack_32_1x128 (dst); michael@0: __m128i sa = expand_alpha_1x128 (s); michael@0: __m128i da = expand_alpha_1x128 (d); michael@0: michael@0: s = pix_multiply_1x128 (s, m); michael@0: m = negate_1x128 (pix_multiply_1x128 (m, sa)); michael@0: michael@0: return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_atop_ca (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: uint32_t s, m, d; michael@0: michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; michael@0: __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; michael@0: __m128i xmm_mask_lo, xmm_mask_hi; michael@0: michael@0: while (w && (uintptr_t)pd & 15) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_dst_hi = load_128_aligned ((__m128i*)pd); michael@0: xmm_src_hi = load_128_unaligned ((__m128i*)ps); michael@0: xmm_mask_hi = load_128_unaligned ((__m128i*)pm); michael@0: michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_alpha_src_lo, &xmm_alpha_src_hi); michael@0: expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, michael@0: &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_src_lo, &xmm_src_hi); michael@0: pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_alpha_src_lo, &xmm_alpha_src_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: pix_add_multiply_2x128 ( michael@0: &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: pm += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: static force_inline uint32_t michael@0: core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src, michael@0: uint32_t mask, michael@0: uint32_t dst) michael@0: { michael@0: __m128i m = unpack_32_1x128 (mask); michael@0: __m128i s = unpack_32_1x128 (src); michael@0: __m128i d = unpack_32_1x128 (dst); michael@0: michael@0: __m128i da = negate_1x128 (expand_alpha_1x128 (d)); michael@0: __m128i sa = expand_alpha_1x128 (s); michael@0: michael@0: s = pix_multiply_1x128 (s, m); michael@0: m = pix_multiply_1x128 (m, sa); michael@0: michael@0: return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: uint32_t s, m, d; michael@0: michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; michael@0: __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; michael@0: __m128i xmm_mask_lo, xmm_mask_hi; michael@0: michael@0: while (w && (uintptr_t)pd & 15) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_dst_hi = load_128_aligned ((__m128i*)pd); michael@0: xmm_src_hi = load_128_unaligned ((__m128i*)ps); michael@0: xmm_mask_hi = load_128_unaligned ((__m128i*)pm); michael@0: michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_alpha_src_lo, &xmm_alpha_src_hi); michael@0: expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, michael@0: &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_src_lo, &xmm_src_hi); michael@0: pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_alpha_src_lo, &xmm_alpha_src_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, michael@0: &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); michael@0: michael@0: pix_add_multiply_2x128 ( michael@0: &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: pm += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: static force_inline uint32_t michael@0: core_combine_xor_ca_pixel_sse2 (uint32_t src, michael@0: uint32_t mask, michael@0: uint32_t dst) michael@0: { michael@0: __m128i a = unpack_32_1x128 (mask); michael@0: __m128i s = unpack_32_1x128 (src); michael@0: __m128i d = unpack_32_1x128 (dst); michael@0: michael@0: __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 ( michael@0: a, expand_alpha_1x128 (s))); michael@0: __m128i dest = pix_multiply_1x128 (s, a); michael@0: __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d)); michael@0: michael@0: return pack_1x128_32 (pix_add_multiply_1x128 (&d, michael@0: &alpha_dst, michael@0: &dest, michael@0: &alpha_src)); michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_xor_ca (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: uint32_t s, m, d; michael@0: michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; michael@0: __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; michael@0: __m128i xmm_mask_lo, xmm_mask_hi; michael@0: michael@0: while (w && (uintptr_t)pd & 15) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_dst_hi = load_128_aligned ((__m128i*)pd); michael@0: xmm_src_hi = load_128_unaligned ((__m128i*)ps); michael@0: xmm_mask_hi = load_128_unaligned ((__m128i*)pm); michael@0: michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_alpha_src_lo, &xmm_alpha_src_hi); michael@0: expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, michael@0: &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_src_lo, &xmm_src_hi); michael@0: pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_alpha_src_lo, &xmm_alpha_src_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, michael@0: &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); michael@0: negate_2x128 (xmm_mask_lo, xmm_mask_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: pix_add_multiply_2x128 ( michael@0: &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: pm += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: static void michael@0: sse2_combine_add_ca (pixman_implementation_t *imp, michael@0: pixman_op_t op, michael@0: uint32_t * pd, michael@0: const uint32_t * ps, michael@0: const uint32_t * pm, michael@0: int w) michael@0: { michael@0: uint32_t s, m, d; michael@0: michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_mask_lo, xmm_mask_hi; michael@0: michael@0: while (w && (uintptr_t)pd & 15) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = pack_1x128_32 ( michael@0: _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), michael@0: unpack_32_1x128 (m)), michael@0: unpack_32_1x128 (d))); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_src_hi = load_128_unaligned ((__m128i*)ps); michael@0: xmm_mask_hi = load_128_unaligned ((__m128i*)pm); michael@0: xmm_dst_hi = load_128_aligned ((__m128i*)pd); michael@0: michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_src_lo, &xmm_src_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, pack_2x128_128 ( michael@0: _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo), michael@0: _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi))); michael@0: michael@0: ps += 4; michael@0: pd += 4; michael@0: pm += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = *ps++; michael@0: m = *pm++; michael@0: d = *pd; michael@0: michael@0: *pd++ = pack_1x128_32 ( michael@0: _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), michael@0: unpack_32_1x128 (m)), michael@0: unpack_32_1x128 (d))); michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: static force_inline __m128i michael@0: create_mask_16_128 (uint16_t mask) michael@0: { michael@0: return _mm_set1_epi16 (mask); michael@0: } michael@0: michael@0: /* Work around a code generation bug in Sun Studio 12. */ michael@0: #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590) michael@0: # define create_mask_2x32_128(mask0, mask1) \ michael@0: (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1))) michael@0: #else michael@0: static force_inline __m128i michael@0: create_mask_2x32_128 (uint32_t mask0, michael@0: uint32_t mask1) michael@0: { michael@0: return _mm_set_epi32 (mask0, mask1, mask0, mask1); michael@0: } michael@0: #endif michael@0: michael@0: static void michael@0: sse2_composite_over_n_8888 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t src; michael@0: uint32_t *dst_line, *dst, d; michael@0: int32_t w; michael@0: int dst_stride; michael@0: __m128i xmm_src, xmm_alpha; michael@0: __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; michael@0: michael@0: src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); michael@0: michael@0: if (src == 0) michael@0: return; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); michael@0: michael@0: xmm_src = expand_pixel_32_1x128 (src); michael@0: xmm_alpha = expand_alpha_1x128 (xmm_src); michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: michael@0: dst_line += dst_stride; michael@0: w = width; michael@0: michael@0: while (w && (uintptr_t)dst & 15) michael@0: { michael@0: d = *dst; michael@0: *dst++ = pack_1x128_32 (over_1x128 (xmm_src, michael@0: xmm_alpha, michael@0: unpack_32_1x128 (d))); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_dst = load_128_aligned ((__m128i*)dst); michael@0: michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: over_2x128 (&xmm_src, &xmm_src, michael@0: &xmm_alpha, &xmm_alpha, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: /* rebuid the 4 pixel data and save*/ michael@0: save_128_aligned ( michael@0: (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: w -= 4; michael@0: dst += 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: d = *dst; michael@0: *dst++ = pack_1x128_32 (over_1x128 (xmm_src, michael@0: xmm_alpha, michael@0: unpack_32_1x128 (d))); michael@0: w--; michael@0: } michael@0: michael@0: } michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_over_n_0565 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t src; michael@0: uint16_t *dst_line, *dst, d; michael@0: int32_t w; michael@0: int dst_stride; michael@0: __m128i xmm_src, xmm_alpha; michael@0: __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; michael@0: michael@0: src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); michael@0: michael@0: if (src == 0) michael@0: return; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); michael@0: michael@0: xmm_src = expand_pixel_32_1x128 (src); michael@0: xmm_alpha = expand_alpha_1x128 (xmm_src); michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: michael@0: dst_line += dst_stride; michael@0: w = width; michael@0: michael@0: while (w && (uintptr_t)dst & 15) michael@0: { michael@0: d = *dst; michael@0: michael@0: *dst++ = pack_565_32_16 ( michael@0: pack_1x128_32 (over_1x128 (xmm_src, michael@0: xmm_alpha, michael@0: expand565_16_1x128 (d)))); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 8) michael@0: { michael@0: xmm_dst = load_128_aligned ((__m128i*)dst); michael@0: michael@0: unpack_565_128_4x128 (xmm_dst, michael@0: &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); michael@0: michael@0: over_2x128 (&xmm_src, &xmm_src, michael@0: &xmm_alpha, &xmm_alpha, michael@0: &xmm_dst0, &xmm_dst1); michael@0: over_2x128 (&xmm_src, &xmm_src, michael@0: &xmm_alpha, &xmm_alpha, michael@0: &xmm_dst2, &xmm_dst3); michael@0: michael@0: xmm_dst = pack_565_4x128_128 ( michael@0: &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); michael@0: michael@0: save_128_aligned ((__m128i*)dst, xmm_dst); michael@0: michael@0: dst += 8; michael@0: w -= 8; michael@0: } michael@0: michael@0: while (w--) michael@0: { michael@0: d = *dst; michael@0: *dst++ = pack_565_32_16 ( michael@0: pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha, michael@0: expand565_16_1x128 (d)))); michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t src; michael@0: uint32_t *dst_line, d; michael@0: uint32_t *mask_line, m; michael@0: uint32_t pack_cmp; michael@0: int dst_stride, mask_stride; michael@0: michael@0: __m128i xmm_src; michael@0: __m128i xmm_dst; michael@0: __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; michael@0: michael@0: __m128i mmx_src, mmx_mask, mmx_dest; michael@0: michael@0: src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); michael@0: michael@0: if (src == 0) michael@0: return; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); michael@0: michael@0: xmm_src = _mm_unpacklo_epi8 ( michael@0: create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); michael@0: mmx_src = xmm_src; michael@0: michael@0: while (height--) michael@0: { michael@0: int w = width; michael@0: const uint32_t *pm = (uint32_t *)mask_line; michael@0: uint32_t *pd = (uint32_t *)dst_line; michael@0: michael@0: dst_line += dst_stride; michael@0: mask_line += mask_stride; michael@0: michael@0: while (w && (uintptr_t)pd & 15) michael@0: { michael@0: m = *pm++; michael@0: michael@0: if (m) michael@0: { michael@0: d = *pd; michael@0: michael@0: mmx_mask = unpack_32_1x128 (m); michael@0: mmx_dest = unpack_32_1x128 (d); michael@0: michael@0: *pd = pack_1x128_32 ( michael@0: _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), michael@0: mmx_dest)); michael@0: } michael@0: michael@0: pd++; michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_mask = load_128_unaligned ((__m128i*)pm); michael@0: michael@0: pack_cmp = michael@0: _mm_movemask_epi8 ( michael@0: _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); michael@0: michael@0: /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ michael@0: if (pack_cmp != 0xffff) michael@0: { michael@0: xmm_dst = load_128_aligned ((__m128i*)pd); michael@0: michael@0: unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_src, &xmm_src, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi); michael@0: xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst)); michael@0: } michael@0: michael@0: pd += 4; michael@0: pm += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: m = *pm++; michael@0: michael@0: if (m) michael@0: { michael@0: d = *pd; michael@0: michael@0: mmx_mask = unpack_32_1x128 (m); michael@0: mmx_dest = unpack_32_1x128 (d); michael@0: michael@0: *pd = pack_1x128_32 ( michael@0: _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), michael@0: mmx_dest)); michael@0: } michael@0: michael@0: pd++; michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t src; michael@0: uint32_t *dst_line, d; michael@0: uint32_t *mask_line, m; michael@0: uint32_t pack_cmp; michael@0: int dst_stride, mask_stride; michael@0: michael@0: __m128i xmm_src, xmm_alpha; michael@0: __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; michael@0: michael@0: __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; michael@0: michael@0: src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); michael@0: michael@0: if (src == 0) michael@0: return; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); michael@0: michael@0: xmm_src = _mm_unpacklo_epi8 ( michael@0: create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); michael@0: xmm_alpha = expand_alpha_1x128 (xmm_src); michael@0: mmx_src = xmm_src; michael@0: mmx_alpha = xmm_alpha; michael@0: michael@0: while (height--) michael@0: { michael@0: int w = width; michael@0: const uint32_t *pm = (uint32_t *)mask_line; michael@0: uint32_t *pd = (uint32_t *)dst_line; michael@0: michael@0: dst_line += dst_stride; michael@0: mask_line += mask_stride; michael@0: michael@0: while (w && (uintptr_t)pd & 15) michael@0: { michael@0: m = *pm++; michael@0: michael@0: if (m) michael@0: { michael@0: d = *pd; michael@0: mmx_mask = unpack_32_1x128 (m); michael@0: mmx_dest = unpack_32_1x128 (d); michael@0: michael@0: *pd = pack_1x128_32 (in_over_1x128 (&mmx_src, michael@0: &mmx_alpha, michael@0: &mmx_mask, michael@0: &mmx_dest)); michael@0: } michael@0: michael@0: pd++; michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_mask = load_128_unaligned ((__m128i*)pm); michael@0: michael@0: pack_cmp = michael@0: _mm_movemask_epi8 ( michael@0: _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); michael@0: michael@0: /* if all bits in mask are zero, pack_cmp are equal to 0xffff */ michael@0: if (pack_cmp != 0xffff) michael@0: { michael@0: xmm_dst = load_128_aligned ((__m128i*)pd); michael@0: michael@0: unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: in_over_2x128 (&xmm_src, &xmm_src, michael@0: &xmm_alpha, &xmm_alpha, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: } michael@0: michael@0: pd += 4; michael@0: pm += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: m = *pm++; michael@0: michael@0: if (m) michael@0: { michael@0: d = *pd; michael@0: mmx_mask = unpack_32_1x128 (m); michael@0: mmx_dest = unpack_32_1x128 (d); michael@0: michael@0: *pd = pack_1x128_32 ( michael@0: in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); michael@0: } michael@0: michael@0: pd++; michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t *dst_line, *dst; michael@0: uint32_t *src_line, *src; michael@0: uint32_t mask; michael@0: int32_t w; michael@0: int dst_stride, src_stride; michael@0: michael@0: __m128i xmm_mask; michael@0: __m128i xmm_src, xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_alpha_lo, xmm_alpha_hi; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); michael@0: michael@0: mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); michael@0: michael@0: xmm_mask = create_mask_16_128 (mask >> 24); michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: src = src_line; michael@0: src_line += src_stride; michael@0: w = width; michael@0: michael@0: while (w && (uintptr_t)dst & 15) michael@0: { michael@0: uint32_t s = *src++; michael@0: michael@0: if (s) michael@0: { michael@0: uint32_t d = *dst; michael@0: michael@0: __m128i ms = unpack_32_1x128 (s); michael@0: __m128i alpha = expand_alpha_1x128 (ms); michael@0: __m128i dest = xmm_mask; michael@0: __m128i alpha_dst = unpack_32_1x128 (d); michael@0: michael@0: *dst = pack_1x128_32 ( michael@0: in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); michael@0: } michael@0: dst++; michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_src = load_128_unaligned ((__m128i*)src); michael@0: michael@0: if (!is_zero (xmm_src)) michael@0: { michael@0: xmm_dst = load_128_aligned ((__m128i*)dst); michael@0: michael@0: unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi); michael@0: michael@0: in_over_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi, michael@0: &xmm_mask, &xmm_mask, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: } michael@0: michael@0: dst += 4; michael@0: src += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: uint32_t s = *src++; michael@0: michael@0: if (s) michael@0: { michael@0: uint32_t d = *dst; michael@0: michael@0: __m128i ms = unpack_32_1x128 (s); michael@0: __m128i alpha = expand_alpha_1x128 (ms); michael@0: __m128i mask = xmm_mask; michael@0: __m128i dest = unpack_32_1x128 (d); michael@0: michael@0: *dst = pack_1x128_32 ( michael@0: in_over_1x128 (&ms, &alpha, &mask, &dest)); michael@0: } michael@0: michael@0: dst++; michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_src_x888_0565 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint16_t *dst_line, *dst; michael@0: uint32_t *src_line, *src, s; michael@0: int dst_stride, src_stride; michael@0: int32_t w; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: src = src_line; michael@0: src_line += src_stride; michael@0: w = width; michael@0: michael@0: while (w && (uintptr_t)dst & 15) michael@0: { michael@0: s = *src++; michael@0: *dst = convert_8888_to_0565 (s); michael@0: dst++; michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 8) michael@0: { michael@0: __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0); michael@0: __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1); michael@0: michael@0: save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1)); michael@0: michael@0: w -= 8; michael@0: src += 8; michael@0: dst += 8; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = *src++; michael@0: *dst = convert_8888_to_0565 (s); michael@0: dst++; michael@0: w--; michael@0: } michael@0: } michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_src_x888_8888 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t *dst_line, *dst; michael@0: uint32_t *src_line, *src; michael@0: int32_t w; michael@0: int dst_stride, src_stride; michael@0: michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: src = src_line; michael@0: src_line += src_stride; michael@0: w = width; michael@0: michael@0: while (w && (uintptr_t)dst & 15) michael@0: { michael@0: *dst++ = *src++ | 0xff000000; michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 16) michael@0: { michael@0: __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4; michael@0: michael@0: xmm_src1 = load_128_unaligned ((__m128i*)src + 0); michael@0: xmm_src2 = load_128_unaligned ((__m128i*)src + 1); michael@0: xmm_src3 = load_128_unaligned ((__m128i*)src + 2); michael@0: xmm_src4 = load_128_unaligned ((__m128i*)src + 3); michael@0: michael@0: save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000)); michael@0: save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000)); michael@0: save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000)); michael@0: save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000)); michael@0: michael@0: dst += 16; michael@0: src += 16; michael@0: w -= 16; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: *dst++ = *src++ | 0xff000000; michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t *dst_line, *dst; michael@0: uint32_t *src_line, *src; michael@0: uint32_t mask; michael@0: int dst_stride, src_stride; michael@0: int32_t w; michael@0: michael@0: __m128i xmm_mask, xmm_alpha; michael@0: __m128i xmm_src, xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); michael@0: michael@0: mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); michael@0: michael@0: xmm_mask = create_mask_16_128 (mask >> 24); michael@0: xmm_alpha = mask_00ff; michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: src = src_line; michael@0: src_line += src_stride; michael@0: w = width; michael@0: michael@0: while (w && (uintptr_t)dst & 15) michael@0: { michael@0: uint32_t s = (*src++) | 0xff000000; michael@0: uint32_t d = *dst; michael@0: michael@0: __m128i src = unpack_32_1x128 (s); michael@0: __m128i alpha = xmm_alpha; michael@0: __m128i mask = xmm_mask; michael@0: __m128i dest = unpack_32_1x128 (d); michael@0: michael@0: *dst++ = pack_1x128_32 ( michael@0: in_over_1x128 (&src, &alpha, &mask, &dest)); michael@0: michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_src = _mm_or_si128 ( michael@0: load_128_unaligned ((__m128i*)src), mask_ff000000); michael@0: xmm_dst = load_128_aligned ((__m128i*)dst); michael@0: michael@0: unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: in_over_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_alpha, &xmm_alpha, michael@0: &xmm_mask, &xmm_mask, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: dst += 4; michael@0: src += 4; michael@0: w -= 4; michael@0: michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: uint32_t s = (*src++) | 0xff000000; michael@0: uint32_t d = *dst; michael@0: michael@0: __m128i src = unpack_32_1x128 (s); michael@0: __m128i alpha = xmm_alpha; michael@0: __m128i mask = xmm_mask; michael@0: __m128i dest = unpack_32_1x128 (d); michael@0: michael@0: *dst++ = pack_1x128_32 ( michael@0: in_over_1x128 (&src, &alpha, &mask, &dest)); michael@0: michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_over_8888_8888 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: int dst_stride, src_stride; michael@0: uint32_t *dst_line, *dst; michael@0: uint32_t *src_line, *src; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); michael@0: michael@0: dst = dst_line; michael@0: src = src_line; michael@0: michael@0: while (height--) michael@0: { michael@0: sse2_combine_over_u (imp, op, dst, src, NULL, width); michael@0: michael@0: dst += dst_stride; michael@0: src += src_stride; michael@0: } michael@0: } michael@0: michael@0: static force_inline uint16_t michael@0: composite_over_8888_0565pixel (uint32_t src, uint16_t dst) michael@0: { michael@0: __m128i ms; michael@0: michael@0: ms = unpack_32_1x128 (src); michael@0: return pack_565_32_16 ( michael@0: pack_1x128_32 ( michael@0: over_1x128 ( michael@0: ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst)))); michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_over_8888_0565 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint16_t *dst_line, *dst, d; michael@0: uint32_t *src_line, *src, s; michael@0: int dst_stride, src_stride; michael@0: int32_t w; michael@0: michael@0: __m128i xmm_alpha_lo, xmm_alpha_hi; michael@0: __m128i xmm_src, xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: src = src_line; michael@0: michael@0: dst_line += dst_stride; michael@0: src_line += src_stride; michael@0: w = width; michael@0: michael@0: /* Align dst on a 16-byte boundary */ michael@0: while (w && michael@0: ((uintptr_t)dst & 15)) michael@0: { michael@0: s = *src++; michael@0: d = *dst; michael@0: michael@0: *dst++ = composite_over_8888_0565pixel (s, d); michael@0: w--; michael@0: } michael@0: michael@0: /* It's a 8 pixel loop */ michael@0: while (w >= 8) michael@0: { michael@0: /* I'm loading unaligned because I'm not sure michael@0: * about the address alignment. michael@0: */ michael@0: xmm_src = load_128_unaligned ((__m128i*) src); michael@0: xmm_dst = load_128_aligned ((__m128i*) dst); michael@0: michael@0: /* Unpacking */ michael@0: unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_565_128_4x128 (xmm_dst, michael@0: &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi); michael@0: michael@0: /* I'm loading next 4 pixels from memory michael@0: * before to optimze the memory read. michael@0: */ michael@0: xmm_src = load_128_unaligned ((__m128i*) (src + 4)); michael@0: michael@0: over_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi, michael@0: &xmm_dst0, &xmm_dst1); michael@0: michael@0: /* Unpacking */ michael@0: unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi); michael@0: michael@0: over_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi, michael@0: &xmm_dst2, &xmm_dst3); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)dst, pack_565_4x128_128 ( michael@0: &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); michael@0: michael@0: w -= 8; michael@0: dst += 8; michael@0: src += 8; michael@0: } michael@0: michael@0: while (w--) michael@0: { michael@0: s = *src++; michael@0: d = *dst; michael@0: michael@0: *dst++ = composite_over_8888_0565pixel (s, d); michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t src, srca; michael@0: uint32_t *dst_line, *dst; michael@0: uint8_t *mask_line, *mask; michael@0: int dst_stride, mask_stride; michael@0: int32_t w; michael@0: uint32_t m, d; michael@0: michael@0: __m128i xmm_src, xmm_alpha, xmm_def; michael@0: __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; michael@0: michael@0: __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; michael@0: michael@0: src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); michael@0: michael@0: srca = src >> 24; michael@0: if (src == 0) michael@0: return; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); michael@0: michael@0: xmm_def = create_mask_2x32_128 (src, src); michael@0: xmm_src = expand_pixel_32_1x128 (src); michael@0: xmm_alpha = expand_alpha_1x128 (xmm_src); michael@0: mmx_src = xmm_src; michael@0: mmx_alpha = xmm_alpha; michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: mask = mask_line; michael@0: mask_line += mask_stride; michael@0: w = width; michael@0: michael@0: while (w && (uintptr_t)dst & 15) michael@0: { michael@0: uint8_t m = *mask++; michael@0: michael@0: if (m) michael@0: { michael@0: d = *dst; michael@0: mmx_mask = expand_pixel_8_1x128 (m); michael@0: mmx_dest = unpack_32_1x128 (d); michael@0: michael@0: *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, michael@0: &mmx_alpha, michael@0: &mmx_mask, michael@0: &mmx_dest)); michael@0: } michael@0: michael@0: w--; michael@0: dst++; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: m = *((uint32_t*)mask); michael@0: michael@0: if (srca == 0xff && m == 0xffffffff) michael@0: { michael@0: save_128_aligned ((__m128i*)dst, xmm_def); michael@0: } michael@0: else if (m) michael@0: { michael@0: xmm_dst = load_128_aligned ((__m128i*) dst); michael@0: xmm_mask = unpack_32_1x128 (m); michael@0: xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); michael@0: michael@0: /* Unpacking */ michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: in_over_2x128 (&xmm_src, &xmm_src, michael@0: &xmm_alpha, &xmm_alpha, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: } michael@0: michael@0: w -= 4; michael@0: dst += 4; michael@0: mask += 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: uint8_t m = *mask++; michael@0: michael@0: if (m) michael@0: { michael@0: d = *dst; michael@0: mmx_mask = expand_pixel_8_1x128 (m); michael@0: mmx_dest = unpack_32_1x128 (d); michael@0: michael@0: *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, michael@0: &mmx_alpha, michael@0: &mmx_mask, michael@0: &mmx_dest)); michael@0: } michael@0: michael@0: w--; michael@0: dst++; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) michael@0: __attribute__((__force_align_arg_pointer__)) michael@0: #endif michael@0: static pixman_bool_t michael@0: sse2_fill (pixman_implementation_t *imp, michael@0: uint32_t * bits, michael@0: int stride, michael@0: int bpp, michael@0: int x, michael@0: int y, michael@0: int width, michael@0: int height, michael@0: uint32_t filler) michael@0: { michael@0: uint32_t byte_width; michael@0: uint8_t *byte_line; michael@0: michael@0: __m128i xmm_def; michael@0: michael@0: if (bpp == 8) michael@0: { michael@0: uint8_t b; michael@0: uint16_t w; michael@0: michael@0: stride = stride * (int) sizeof (uint32_t) / 1; michael@0: byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); michael@0: byte_width = width; michael@0: stride *= 1; michael@0: michael@0: b = filler & 0xff; michael@0: w = (b << 8) | b; michael@0: filler = (w << 16) | w; michael@0: } michael@0: else if (bpp == 16) michael@0: { michael@0: stride = stride * (int) sizeof (uint32_t) / 2; michael@0: byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); michael@0: byte_width = 2 * width; michael@0: stride *= 2; michael@0: michael@0: filler = (filler & 0xffff) * 0x00010001; michael@0: } michael@0: else if (bpp == 32) michael@0: { michael@0: stride = stride * (int) sizeof (uint32_t) / 4; michael@0: byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); michael@0: byte_width = 4 * width; michael@0: stride *= 4; michael@0: } michael@0: else michael@0: { michael@0: return FALSE; michael@0: } michael@0: michael@0: xmm_def = create_mask_2x32_128 (filler, filler); michael@0: michael@0: while (height--) michael@0: { michael@0: int w; michael@0: uint8_t *d = byte_line; michael@0: byte_line += stride; michael@0: w = byte_width; michael@0: michael@0: if (w >= 1 && ((uintptr_t)d & 1)) michael@0: { michael@0: *(uint8_t *)d = filler; michael@0: w -= 1; michael@0: d += 1; michael@0: } michael@0: michael@0: while (w >= 2 && ((uintptr_t)d & 3)) michael@0: { michael@0: *(uint16_t *)d = filler; michael@0: w -= 2; michael@0: d += 2; michael@0: } michael@0: michael@0: while (w >= 4 && ((uintptr_t)d & 15)) michael@0: { michael@0: *(uint32_t *)d = filler; michael@0: michael@0: w -= 4; michael@0: d += 4; michael@0: } michael@0: michael@0: while (w >= 128) michael@0: { michael@0: save_128_aligned ((__m128i*)(d), xmm_def); michael@0: save_128_aligned ((__m128i*)(d + 16), xmm_def); michael@0: save_128_aligned ((__m128i*)(d + 32), xmm_def); michael@0: save_128_aligned ((__m128i*)(d + 48), xmm_def); michael@0: save_128_aligned ((__m128i*)(d + 64), xmm_def); michael@0: save_128_aligned ((__m128i*)(d + 80), xmm_def); michael@0: save_128_aligned ((__m128i*)(d + 96), xmm_def); michael@0: save_128_aligned ((__m128i*)(d + 112), xmm_def); michael@0: michael@0: d += 128; michael@0: w -= 128; michael@0: } michael@0: michael@0: if (w >= 64) michael@0: { michael@0: save_128_aligned ((__m128i*)(d), xmm_def); michael@0: save_128_aligned ((__m128i*)(d + 16), xmm_def); michael@0: save_128_aligned ((__m128i*)(d + 32), xmm_def); michael@0: save_128_aligned ((__m128i*)(d + 48), xmm_def); michael@0: michael@0: d += 64; michael@0: w -= 64; michael@0: } michael@0: michael@0: if (w >= 32) michael@0: { michael@0: save_128_aligned ((__m128i*)(d), xmm_def); michael@0: save_128_aligned ((__m128i*)(d + 16), xmm_def); michael@0: michael@0: d += 32; michael@0: w -= 32; michael@0: } michael@0: michael@0: if (w >= 16) michael@0: { michael@0: save_128_aligned ((__m128i*)(d), xmm_def); michael@0: michael@0: d += 16; michael@0: w -= 16; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: *(uint32_t *)d = filler; michael@0: michael@0: w -= 4; michael@0: d += 4; michael@0: } michael@0: michael@0: if (w >= 2) michael@0: { michael@0: *(uint16_t *)d = filler; michael@0: w -= 2; michael@0: d += 2; michael@0: } michael@0: michael@0: if (w >= 1) michael@0: { michael@0: *(uint8_t *)d = filler; michael@0: w -= 1; michael@0: d += 1; michael@0: } michael@0: } michael@0: michael@0: return TRUE; michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t src, srca; michael@0: uint32_t *dst_line, *dst; michael@0: uint8_t *mask_line, *mask; michael@0: int dst_stride, mask_stride; michael@0: int32_t w; michael@0: uint32_t m; michael@0: michael@0: __m128i xmm_src, xmm_def; michael@0: __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; michael@0: michael@0: src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); michael@0: michael@0: srca = src >> 24; michael@0: if (src == 0) michael@0: { michael@0: sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride, michael@0: PIXMAN_FORMAT_BPP (dest_image->bits.format), michael@0: dest_x, dest_y, width, height, 0); michael@0: return; michael@0: } michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); michael@0: michael@0: xmm_def = create_mask_2x32_128 (src, src); michael@0: xmm_src = expand_pixel_32_1x128 (src); michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: mask = mask_line; michael@0: mask_line += mask_stride; michael@0: w = width; michael@0: michael@0: while (w && (uintptr_t)dst & 15) michael@0: { michael@0: uint8_t m = *mask++; michael@0: michael@0: if (m) michael@0: { michael@0: *dst = pack_1x128_32 ( michael@0: pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m))); michael@0: } michael@0: else michael@0: { michael@0: *dst = 0; michael@0: } michael@0: michael@0: w--; michael@0: dst++; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: m = *((uint32_t*)mask); michael@0: michael@0: if (srca == 0xff && m == 0xffffffff) michael@0: { michael@0: save_128_aligned ((__m128i*)dst, xmm_def); michael@0: } michael@0: else if (m) michael@0: { michael@0: xmm_mask = unpack_32_1x128 (m); michael@0: xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); michael@0: michael@0: /* Unpacking */ michael@0: unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_src, &xmm_src, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); michael@0: } michael@0: else michael@0: { michael@0: save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ()); michael@0: } michael@0: michael@0: w -= 4; michael@0: dst += 4; michael@0: mask += 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: uint8_t m = *mask++; michael@0: michael@0: if (m) michael@0: { michael@0: *dst = pack_1x128_32 ( michael@0: pix_multiply_1x128 ( michael@0: xmm_src, expand_pixel_8_1x128 (m))); michael@0: } michael@0: else michael@0: { michael@0: *dst = 0; michael@0: } michael@0: michael@0: w--; michael@0: dst++; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t src; michael@0: uint16_t *dst_line, *dst, d; michael@0: uint8_t *mask_line, *mask; michael@0: int dst_stride, mask_stride; michael@0: int32_t w; michael@0: uint32_t m; michael@0: __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; michael@0: michael@0: __m128i xmm_src, xmm_alpha; michael@0: __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; michael@0: __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; michael@0: michael@0: src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); michael@0: michael@0: if (src == 0) michael@0: return; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); michael@0: michael@0: xmm_src = expand_pixel_32_1x128 (src); michael@0: xmm_alpha = expand_alpha_1x128 (xmm_src); michael@0: mmx_src = xmm_src; michael@0: mmx_alpha = xmm_alpha; michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: mask = mask_line; michael@0: mask_line += mask_stride; michael@0: w = width; michael@0: michael@0: while (w && (uintptr_t)dst & 15) michael@0: { michael@0: m = *mask++; michael@0: michael@0: if (m) michael@0: { michael@0: d = *dst; michael@0: mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); michael@0: mmx_dest = expand565_16_1x128 (d); michael@0: michael@0: *dst = pack_565_32_16 ( michael@0: pack_1x128_32 ( michael@0: in_over_1x128 ( michael@0: &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); michael@0: } michael@0: michael@0: w--; michael@0: dst++; michael@0: } michael@0: michael@0: while (w >= 8) michael@0: { michael@0: xmm_dst = load_128_aligned ((__m128i*) dst); michael@0: unpack_565_128_4x128 (xmm_dst, michael@0: &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); michael@0: michael@0: m = *((uint32_t*)mask); michael@0: mask += 4; michael@0: michael@0: if (m) michael@0: { michael@0: xmm_mask = unpack_32_1x128 (m); michael@0: xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); michael@0: michael@0: /* Unpacking */ michael@0: unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: in_over_2x128 (&xmm_src, &xmm_src, michael@0: &xmm_alpha, &xmm_alpha, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_dst0, &xmm_dst1); michael@0: } michael@0: michael@0: m = *((uint32_t*)mask); michael@0: mask += 4; michael@0: michael@0: if (m) michael@0: { michael@0: xmm_mask = unpack_32_1x128 (m); michael@0: xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); michael@0: michael@0: /* Unpacking */ michael@0: unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi); michael@0: in_over_2x128 (&xmm_src, &xmm_src, michael@0: &xmm_alpha, &xmm_alpha, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_dst2, &xmm_dst3); michael@0: } michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)dst, pack_565_4x128_128 ( michael@0: &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); michael@0: michael@0: w -= 8; michael@0: dst += 8; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: m = *mask++; michael@0: michael@0: if (m) michael@0: { michael@0: d = *dst; michael@0: mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); michael@0: mmx_dest = expand565_16_1x128 (d); michael@0: michael@0: *dst = pack_565_32_16 ( michael@0: pack_1x128_32 ( michael@0: in_over_1x128 ( michael@0: &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); michael@0: } michael@0: michael@0: w--; michael@0: dst++; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint16_t *dst_line, *dst, d; michael@0: uint32_t *src_line, *src, s; michael@0: int dst_stride, src_stride; michael@0: int32_t w; michael@0: uint32_t opaque, zero; michael@0: michael@0: __m128i ms; michael@0: __m128i xmm_src, xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: src = src_line; michael@0: src_line += src_stride; michael@0: w = width; michael@0: michael@0: while (w && (uintptr_t)dst & 15) michael@0: { michael@0: s = *src++; michael@0: d = *dst; michael@0: michael@0: ms = unpack_32_1x128 (s); michael@0: michael@0: *dst++ = pack_565_32_16 ( michael@0: pack_1x128_32 ( michael@0: over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 8) michael@0: { michael@0: /* First round */ michael@0: xmm_src = load_128_unaligned ((__m128i*)src); michael@0: xmm_dst = load_128_aligned ((__m128i*)dst); michael@0: michael@0: opaque = is_opaque (xmm_src); michael@0: zero = is_zero (xmm_src); michael@0: michael@0: unpack_565_128_4x128 (xmm_dst, michael@0: &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); michael@0: unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); michael@0: michael@0: /* preload next round*/ michael@0: xmm_src = load_128_unaligned ((__m128i*)(src + 4)); michael@0: michael@0: if (opaque) michael@0: { michael@0: invert_colors_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_dst0, &xmm_dst1); michael@0: } michael@0: else if (!zero) michael@0: { michael@0: over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_dst0, &xmm_dst1); michael@0: } michael@0: michael@0: /* Second round */ michael@0: opaque = is_opaque (xmm_src); michael@0: zero = is_zero (xmm_src); michael@0: michael@0: unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); michael@0: michael@0: if (opaque) michael@0: { michael@0: invert_colors_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_dst2, &xmm_dst3); michael@0: } michael@0: else if (!zero) michael@0: { michael@0: over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_dst2, &xmm_dst3); michael@0: } michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)dst, pack_565_4x128_128 ( michael@0: &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); michael@0: michael@0: w -= 8; michael@0: src += 8; michael@0: dst += 8; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = *src++; michael@0: d = *dst; michael@0: michael@0: ms = unpack_32_1x128 (s); michael@0: michael@0: *dst++ = pack_565_32_16 ( michael@0: pack_1x128_32 ( michael@0: over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t *dst_line, *dst, d; michael@0: uint32_t *src_line, *src, s; michael@0: int dst_stride, src_stride; michael@0: int32_t w; michael@0: uint32_t opaque, zero; michael@0: michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: src = src_line; michael@0: src_line += src_stride; michael@0: w = width; michael@0: michael@0: while (w && (uintptr_t)dst & 15) michael@0: { michael@0: s = *src++; michael@0: d = *dst; michael@0: michael@0: *dst++ = pack_1x128_32 ( michael@0: over_rev_non_pre_1x128 ( michael@0: unpack_32_1x128 (s), unpack_32_1x128 (d))); michael@0: michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_src_hi = load_128_unaligned ((__m128i*)src); michael@0: michael@0: opaque = is_opaque (xmm_src_hi); michael@0: zero = is_zero (xmm_src_hi); michael@0: michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: michael@0: if (opaque) michael@0: { michael@0: invert_colors_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: } michael@0: else if (!zero) michael@0: { michael@0: xmm_dst_hi = load_128_aligned ((__m128i*)dst); michael@0: michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: } michael@0: michael@0: w -= 4; michael@0: dst += 4; michael@0: src += 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = *src++; michael@0: d = *dst; michael@0: michael@0: *dst++ = pack_1x128_32 ( michael@0: over_rev_non_pre_1x128 ( michael@0: unpack_32_1x128 (s), unpack_32_1x128 (d))); michael@0: michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t src; michael@0: uint16_t *dst_line, *dst, d; michael@0: uint32_t *mask_line, *mask, m; michael@0: int dst_stride, mask_stride; michael@0: int w; michael@0: uint32_t pack_cmp; michael@0: michael@0: __m128i xmm_src, xmm_alpha; michael@0: __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; michael@0: __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; michael@0: michael@0: __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; michael@0: michael@0: src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); michael@0: michael@0: if (src == 0) michael@0: return; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); michael@0: michael@0: xmm_src = expand_pixel_32_1x128 (src); michael@0: xmm_alpha = expand_alpha_1x128 (xmm_src); michael@0: mmx_src = xmm_src; michael@0: mmx_alpha = xmm_alpha; michael@0: michael@0: while (height--) michael@0: { michael@0: w = width; michael@0: mask = mask_line; michael@0: dst = dst_line; michael@0: mask_line += mask_stride; michael@0: dst_line += dst_stride; michael@0: michael@0: while (w && ((uintptr_t)dst & 15)) michael@0: { michael@0: m = *(uint32_t *) mask; michael@0: michael@0: if (m) michael@0: { michael@0: d = *dst; michael@0: mmx_mask = unpack_32_1x128 (m); michael@0: mmx_dest = expand565_16_1x128 (d); michael@0: michael@0: *dst = pack_565_32_16 ( michael@0: pack_1x128_32 ( michael@0: in_over_1x128 ( michael@0: &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); michael@0: } michael@0: michael@0: w--; michael@0: dst++; michael@0: mask++; michael@0: } michael@0: michael@0: while (w >= 8) michael@0: { michael@0: /* First round */ michael@0: xmm_mask = load_128_unaligned ((__m128i*)mask); michael@0: xmm_dst = load_128_aligned ((__m128i*)dst); michael@0: michael@0: pack_cmp = _mm_movemask_epi8 ( michael@0: _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); michael@0: michael@0: unpack_565_128_4x128 (xmm_dst, michael@0: &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); michael@0: unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: /* preload next round */ michael@0: xmm_mask = load_128_unaligned ((__m128i*)(mask + 4)); michael@0: michael@0: /* preload next round */ michael@0: if (pack_cmp != 0xffff) michael@0: { michael@0: in_over_2x128 (&xmm_src, &xmm_src, michael@0: &xmm_alpha, &xmm_alpha, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_dst0, &xmm_dst1); michael@0: } michael@0: michael@0: /* Second round */ michael@0: pack_cmp = _mm_movemask_epi8 ( michael@0: _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); michael@0: michael@0: unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: if (pack_cmp != 0xffff) michael@0: { michael@0: in_over_2x128 (&xmm_src, &xmm_src, michael@0: &xmm_alpha, &xmm_alpha, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_dst2, &xmm_dst3); michael@0: } michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)dst, pack_565_4x128_128 ( michael@0: &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); michael@0: michael@0: w -= 8; michael@0: dst += 8; michael@0: mask += 8; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: m = *(uint32_t *) mask; michael@0: michael@0: if (m) michael@0: { michael@0: d = *dst; michael@0: mmx_mask = unpack_32_1x128 (m); michael@0: mmx_dest = expand565_16_1x128 (d); michael@0: michael@0: *dst = pack_565_32_16 ( michael@0: pack_1x128_32 ( michael@0: in_over_1x128 ( michael@0: &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); michael@0: } michael@0: michael@0: w--; michael@0: dst++; michael@0: mask++; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_in_n_8_8 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint8_t *dst_line, *dst; michael@0: uint8_t *mask_line, *mask; michael@0: int dst_stride, mask_stride; michael@0: uint32_t d, m; michael@0: uint32_t src; michael@0: int32_t w; michael@0: michael@0: __m128i xmm_alpha; michael@0: __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; michael@0: __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); michael@0: michael@0: src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); michael@0: michael@0: xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: mask = mask_line; michael@0: mask_line += mask_stride; michael@0: w = width; michael@0: michael@0: while (w && ((uintptr_t)dst & 15)) michael@0: { michael@0: m = (uint32_t) *mask++; michael@0: d = (uint32_t) *dst; michael@0: michael@0: *dst++ = (uint8_t) pack_1x128_32 ( michael@0: pix_multiply_1x128 ( michael@0: pix_multiply_1x128 (xmm_alpha, michael@0: unpack_32_1x128 (m)), michael@0: unpack_32_1x128 (d))); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 16) michael@0: { michael@0: xmm_mask = load_128_unaligned ((__m128i*)mask); michael@0: xmm_dst = load_128_aligned ((__m128i*)dst); michael@0: michael@0: unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: mask += 16; michael@0: dst += 16; michael@0: w -= 16; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: m = (uint32_t) *mask++; michael@0: d = (uint32_t) *dst; michael@0: michael@0: *dst++ = (uint8_t) pack_1x128_32 ( michael@0: pix_multiply_1x128 ( michael@0: pix_multiply_1x128 ( michael@0: xmm_alpha, unpack_32_1x128 (m)), michael@0: unpack_32_1x128 (d))); michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_in_n_8 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint8_t *dst_line, *dst; michael@0: int dst_stride; michael@0: uint32_t d; michael@0: uint32_t src; michael@0: int32_t w; michael@0: michael@0: __m128i xmm_alpha; michael@0: __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); michael@0: michael@0: src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); michael@0: michael@0: xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); michael@0: michael@0: src = src >> 24; michael@0: michael@0: if (src == 0xff) michael@0: return; michael@0: michael@0: if (src == 0x00) michael@0: { michael@0: pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, michael@0: 8, dest_x, dest_y, width, height, src); michael@0: michael@0: return; michael@0: } michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: w = width; michael@0: michael@0: while (w && ((uintptr_t)dst & 15)) michael@0: { michael@0: d = (uint32_t) *dst; michael@0: michael@0: *dst++ = (uint8_t) pack_1x128_32 ( michael@0: pix_multiply_1x128 ( michael@0: xmm_alpha, michael@0: unpack_32_1x128 (d))); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 16) michael@0: { michael@0: xmm_dst = load_128_aligned ((__m128i*)dst); michael@0: michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, michael@0: &xmm_dst_lo, &xmm_dst_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: dst += 16; michael@0: w -= 16; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: d = (uint32_t) *dst; michael@0: michael@0: *dst++ = (uint8_t) pack_1x128_32 ( michael@0: pix_multiply_1x128 ( michael@0: xmm_alpha, michael@0: unpack_32_1x128 (d))); michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_in_8_8 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint8_t *dst_line, *dst; michael@0: uint8_t *src_line, *src; michael@0: int src_stride, dst_stride; michael@0: int32_t w; michael@0: uint32_t s, d; michael@0: michael@0: __m128i xmm_src, xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: src = src_line; michael@0: src_line += src_stride; michael@0: w = width; michael@0: michael@0: while (w && ((uintptr_t)dst & 15)) michael@0: { michael@0: s = (uint32_t) *src++; michael@0: d = (uint32_t) *dst; michael@0: michael@0: *dst++ = (uint8_t) pack_1x128_32 ( michael@0: pix_multiply_1x128 ( michael@0: unpack_32_1x128 (s), unpack_32_1x128 (d))); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 16) michael@0: { michael@0: xmm_src = load_128_unaligned ((__m128i*)src); michael@0: xmm_dst = load_128_aligned ((__m128i*)dst); michael@0: michael@0: unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: src += 16; michael@0: dst += 16; michael@0: w -= 16; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: s = (uint32_t) *src++; michael@0: d = (uint32_t) *dst; michael@0: michael@0: *dst++ = (uint8_t) pack_1x128_32 ( michael@0: pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d))); michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_add_n_8_8 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint8_t *dst_line, *dst; michael@0: uint8_t *mask_line, *mask; michael@0: int dst_stride, mask_stride; michael@0: int32_t w; michael@0: uint32_t src; michael@0: uint32_t m, d; michael@0: michael@0: __m128i xmm_alpha; michael@0: __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; michael@0: __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); michael@0: michael@0: src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); michael@0: michael@0: xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: mask = mask_line; michael@0: mask_line += mask_stride; michael@0: w = width; michael@0: michael@0: while (w && ((uintptr_t)dst & 15)) michael@0: { michael@0: m = (uint32_t) *mask++; michael@0: d = (uint32_t) *dst; michael@0: michael@0: *dst++ = (uint8_t) pack_1x128_32 ( michael@0: _mm_adds_epu16 ( michael@0: pix_multiply_1x128 ( michael@0: xmm_alpha, unpack_32_1x128 (m)), michael@0: unpack_32_1x128 (d))); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 16) michael@0: { michael@0: xmm_mask = load_128_unaligned ((__m128i*)mask); michael@0: xmm_dst = load_128_aligned ((__m128i*)dst); michael@0: michael@0: unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); michael@0: xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: michael@0: mask += 16; michael@0: dst += 16; michael@0: w -= 16; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: m = (uint32_t) *mask++; michael@0: d = (uint32_t) *dst; michael@0: michael@0: *dst++ = (uint8_t) pack_1x128_32 ( michael@0: _mm_adds_epu16 ( michael@0: pix_multiply_1x128 ( michael@0: xmm_alpha, unpack_32_1x128 (m)), michael@0: unpack_32_1x128 (d))); michael@0: michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_add_n_8 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint8_t *dst_line, *dst; michael@0: int dst_stride; michael@0: int32_t w; michael@0: uint32_t src; michael@0: michael@0: __m128i xmm_src; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); michael@0: michael@0: src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); michael@0: michael@0: src >>= 24; michael@0: michael@0: if (src == 0x00) michael@0: return; michael@0: michael@0: if (src == 0xff) michael@0: { michael@0: pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, michael@0: 8, dest_x, dest_y, width, height, 0xff); michael@0: michael@0: return; michael@0: } michael@0: michael@0: src = (src << 24) | (src << 16) | (src << 8) | src; michael@0: xmm_src = _mm_set_epi32 (src, src, src, src); michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: w = width; michael@0: michael@0: while (w && ((uintptr_t)dst & 15)) michael@0: { michael@0: *dst = (uint8_t)_mm_cvtsi128_si32 ( michael@0: _mm_adds_epu8 ( michael@0: xmm_src, michael@0: _mm_cvtsi32_si128 (*dst))); michael@0: michael@0: w--; michael@0: dst++; michael@0: } michael@0: michael@0: while (w >= 16) michael@0: { michael@0: save_128_aligned ( michael@0: (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); michael@0: michael@0: dst += 16; michael@0: w -= 16; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: *dst = (uint8_t)_mm_cvtsi128_si32 ( michael@0: _mm_adds_epu8 ( michael@0: xmm_src, michael@0: _mm_cvtsi32_si128 (*dst))); michael@0: michael@0: w--; michael@0: dst++; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_add_8_8 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint8_t *dst_line, *dst; michael@0: uint8_t *src_line, *src; michael@0: int dst_stride, src_stride; michael@0: int32_t w; michael@0: uint16_t t; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: src = src_line; michael@0: michael@0: dst_line += dst_stride; michael@0: src_line += src_stride; michael@0: w = width; michael@0: michael@0: /* Small head */ michael@0: while (w && (uintptr_t)dst & 3) michael@0: { michael@0: t = (*dst) + (*src++); michael@0: *dst++ = t | (0 - (t >> 8)); michael@0: w--; michael@0: } michael@0: michael@0: sse2_combine_add_u (imp, op, michael@0: (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); michael@0: michael@0: /* Small tail */ michael@0: dst += w & 0xfffc; michael@0: src += w & 0xfffc; michael@0: michael@0: w &= 3; michael@0: michael@0: while (w) michael@0: { michael@0: t = (*dst) + (*src++); michael@0: *dst++ = t | (0 - (t >> 8)); michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_add_8888_8888 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t *dst_line, *dst; michael@0: uint32_t *src_line, *src; michael@0: int dst_stride, src_stride; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: src = src_line; michael@0: src_line += src_stride; michael@0: michael@0: sse2_combine_add_u (imp, op, dst, src, NULL, width); michael@0: } michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_add_n_8888 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t *dst_line, *dst, src; michael@0: int dst_stride; michael@0: michael@0: __m128i xmm_src; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); michael@0: michael@0: src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); michael@0: if (src == 0) michael@0: return; michael@0: michael@0: if (src == ~0) michael@0: { michael@0: pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32, michael@0: dest_x, dest_y, width, height, ~0); michael@0: michael@0: return; michael@0: } michael@0: michael@0: xmm_src = _mm_set_epi32 (src, src, src, src); michael@0: while (height--) michael@0: { michael@0: int w = width; michael@0: uint32_t d; michael@0: michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: michael@0: while (w && (unsigned long)dst & 15) michael@0: { michael@0: d = *dst; michael@0: *dst++ = michael@0: _mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d))); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: save_128_aligned michael@0: ((__m128i*)dst, michael@0: _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); michael@0: michael@0: dst += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w--) michael@0: { michael@0: d = *dst; michael@0: *dst++ = michael@0: _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src, michael@0: _mm_cvtsi32_si128 (d))); michael@0: } michael@0: } michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_add_n_8_8888 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t *dst_line, *dst; michael@0: uint8_t *mask_line, *mask; michael@0: int dst_stride, mask_stride; michael@0: int32_t w; michael@0: uint32_t src; michael@0: michael@0: __m128i xmm_src; michael@0: michael@0: src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); michael@0: if (src == 0) michael@0: return; michael@0: xmm_src = expand_pixel_32_1x128 (src); michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: mask = mask_line; michael@0: mask_line += mask_stride; michael@0: w = width; michael@0: michael@0: while (w && ((unsigned long)dst & 15)) michael@0: { michael@0: uint8_t m = *mask++; michael@0: if (m) michael@0: { michael@0: *dst = pack_1x128_32 michael@0: (_mm_adds_epu16 michael@0: (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)), michael@0: unpack_32_1x128 (*dst))); michael@0: } michael@0: dst++; michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: uint32_t m = *(uint32_t*)mask; michael@0: if (m) michael@0: { michael@0: __m128i xmm_mask_lo, xmm_mask_hi; michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: michael@0: __m128i xmm_dst = load_128_aligned ((__m128i*)dst); michael@0: __m128i xmm_mask = michael@0: _mm_unpacklo_epi8 (unpack_32_1x128(m), michael@0: _mm_setzero_si128 ()); michael@0: michael@0: unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: pix_multiply_2x128 (&xmm_src, &xmm_src, michael@0: &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); michael@0: xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: } michael@0: michael@0: w -= 4; michael@0: dst += 4; michael@0: mask += 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: uint8_t m = *mask++; michael@0: if (m) michael@0: { michael@0: *dst = pack_1x128_32 michael@0: (_mm_adds_epu16 michael@0: (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)), michael@0: unpack_32_1x128 (*dst))); michael@0: } michael@0: dst++; michael@0: w--; michael@0: } michael@0: } michael@0: } michael@0: michael@0: static pixman_bool_t michael@0: sse2_blt (pixman_implementation_t *imp, michael@0: uint32_t * src_bits, michael@0: uint32_t * dst_bits, michael@0: int src_stride, michael@0: int dst_stride, michael@0: int src_bpp, michael@0: int dst_bpp, michael@0: int src_x, michael@0: int src_y, michael@0: int dest_x, michael@0: int dest_y, michael@0: int width, michael@0: int height) michael@0: { michael@0: uint8_t * src_bytes; michael@0: uint8_t * dst_bytes; michael@0: int byte_width; michael@0: michael@0: if (src_bpp != dst_bpp) michael@0: return FALSE; michael@0: michael@0: if (src_bpp == 16) michael@0: { michael@0: src_stride = src_stride * (int) sizeof (uint32_t) / 2; michael@0: dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; michael@0: src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); michael@0: dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); michael@0: byte_width = 2 * width; michael@0: src_stride *= 2; michael@0: dst_stride *= 2; michael@0: } michael@0: else if (src_bpp == 32) michael@0: { michael@0: src_stride = src_stride * (int) sizeof (uint32_t) / 4; michael@0: dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; michael@0: src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); michael@0: dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); michael@0: byte_width = 4 * width; michael@0: src_stride *= 4; michael@0: dst_stride *= 4; michael@0: } michael@0: else michael@0: { michael@0: return FALSE; michael@0: } michael@0: michael@0: while (height--) michael@0: { michael@0: int w; michael@0: uint8_t *s = src_bytes; michael@0: uint8_t *d = dst_bytes; michael@0: src_bytes += src_stride; michael@0: dst_bytes += dst_stride; michael@0: w = byte_width; michael@0: michael@0: while (w >= 2 && ((uintptr_t)d & 3)) michael@0: { michael@0: *(uint16_t *)d = *(uint16_t *)s; michael@0: w -= 2; michael@0: s += 2; michael@0: d += 2; michael@0: } michael@0: michael@0: while (w >= 4 && ((uintptr_t)d & 15)) michael@0: { michael@0: *(uint32_t *)d = *(uint32_t *)s; michael@0: michael@0: w -= 4; michael@0: s += 4; michael@0: d += 4; michael@0: } michael@0: michael@0: while (w >= 64) michael@0: { michael@0: __m128i xmm0, xmm1, xmm2, xmm3; michael@0: michael@0: xmm0 = load_128_unaligned ((__m128i*)(s)); michael@0: xmm1 = load_128_unaligned ((__m128i*)(s + 16)); michael@0: xmm2 = load_128_unaligned ((__m128i*)(s + 32)); michael@0: xmm3 = load_128_unaligned ((__m128i*)(s + 48)); michael@0: michael@0: save_128_aligned ((__m128i*)(d), xmm0); michael@0: save_128_aligned ((__m128i*)(d + 16), xmm1); michael@0: save_128_aligned ((__m128i*)(d + 32), xmm2); michael@0: save_128_aligned ((__m128i*)(d + 48), xmm3); michael@0: michael@0: s += 64; michael@0: d += 64; michael@0: w -= 64; michael@0: } michael@0: michael@0: while (w >= 16) michael@0: { michael@0: save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) ); michael@0: michael@0: w -= 16; michael@0: d += 16; michael@0: s += 16; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: *(uint32_t *)d = *(uint32_t *)s; michael@0: michael@0: w -= 4; michael@0: s += 4; michael@0: d += 4; michael@0: } michael@0: michael@0: if (w >= 2) michael@0: { michael@0: *(uint16_t *)d = *(uint16_t *)s; michael@0: w -= 2; michael@0: s += 2; michael@0: d += 2; michael@0: } michael@0: } michael@0: michael@0: return TRUE; michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_copy_area (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: sse2_blt (imp, src_image->bits.bits, michael@0: dest_image->bits.bits, michael@0: src_image->bits.rowstride, michael@0: dest_image->bits.rowstride, michael@0: PIXMAN_FORMAT_BPP (src_image->bits.format), michael@0: PIXMAN_FORMAT_BPP (dest_image->bits.format), michael@0: src_x, src_y, dest_x, dest_y, width, height); michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t *src, *src_line, s; michael@0: uint32_t *dst, *dst_line, d; michael@0: uint8_t *mask, *mask_line; michael@0: uint32_t m; michael@0: int src_stride, mask_stride, dst_stride; michael@0: int32_t w; michael@0: __m128i ms; michael@0: michael@0: __m128i xmm_src, xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); michael@0: michael@0: while (height--) michael@0: { michael@0: src = src_line; michael@0: src_line += src_stride; michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: mask = mask_line; michael@0: mask_line += mask_stride; michael@0: michael@0: w = width; michael@0: michael@0: while (w && (uintptr_t)dst & 15) michael@0: { michael@0: s = 0xff000000 | *src++; michael@0: m = (uint32_t) *mask++; michael@0: d = *dst; michael@0: ms = unpack_32_1x128 (s); michael@0: michael@0: if (m != 0xff) michael@0: { michael@0: __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); michael@0: __m128i md = unpack_32_1x128 (d); michael@0: michael@0: ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md); michael@0: } michael@0: michael@0: *dst++ = pack_1x128_32 (ms); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: m = *(uint32_t*) mask; michael@0: xmm_src = _mm_or_si128 ( michael@0: load_128_unaligned ((__m128i*)src), mask_ff000000); michael@0: michael@0: if (m == 0xffffffff) michael@0: { michael@0: save_128_aligned ((__m128i*)dst, xmm_src); michael@0: } michael@0: else michael@0: { michael@0: xmm_dst = load_128_aligned ((__m128i*)dst); michael@0: michael@0: xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); michael@0: michael@0: unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: expand_alpha_rev_2x128 ( michael@0: xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: in_over_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: } michael@0: michael@0: src += 4; michael@0: dst += 4; michael@0: mask += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: m = (uint32_t) *mask++; michael@0: michael@0: if (m) michael@0: { michael@0: s = 0xff000000 | *src; michael@0: michael@0: if (m == 0xff) michael@0: { michael@0: *dst = s; michael@0: } michael@0: else michael@0: { michael@0: __m128i ma, md, ms; michael@0: michael@0: d = *dst; michael@0: michael@0: ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); michael@0: md = unpack_32_1x128 (d); michael@0: ms = unpack_32_1x128 (s); michael@0: michael@0: *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md)); michael@0: } michael@0: michael@0: } michael@0: michael@0: src++; michael@0: dst++; michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t *src, *src_line, s; michael@0: uint32_t *dst, *dst_line, d; michael@0: uint8_t *mask, *mask_line; michael@0: uint32_t m; michael@0: int src_stride, mask_stride, dst_stride; michael@0: int32_t w; michael@0: michael@0: __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; michael@0: __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); michael@0: michael@0: while (height--) michael@0: { michael@0: src = src_line; michael@0: src_line += src_stride; michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: mask = mask_line; michael@0: mask_line += mask_stride; michael@0: michael@0: w = width; michael@0: michael@0: while (w && (uintptr_t)dst & 15) michael@0: { michael@0: uint32_t sa; michael@0: michael@0: s = *src++; michael@0: m = (uint32_t) *mask++; michael@0: d = *dst; michael@0: michael@0: sa = s >> 24; michael@0: michael@0: if (m) michael@0: { michael@0: if (sa == 0xff && m == 0xff) michael@0: { michael@0: *dst = s; michael@0: } michael@0: else michael@0: { michael@0: __m128i ms, md, ma, msa; michael@0: michael@0: ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); michael@0: ms = unpack_32_1x128 (s); michael@0: md = unpack_32_1x128 (d); michael@0: michael@0: msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); michael@0: michael@0: *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); michael@0: } michael@0: } michael@0: michael@0: dst++; michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: m = *(uint32_t *) mask; michael@0: michael@0: if (m) michael@0: { michael@0: xmm_src = load_128_unaligned ((__m128i*)src); michael@0: michael@0: if (m == 0xffffffff && is_opaque (xmm_src)) michael@0: { michael@0: save_128_aligned ((__m128i *)dst, xmm_src); michael@0: } michael@0: else michael@0: { michael@0: xmm_dst = load_128_aligned ((__m128i *)dst); michael@0: michael@0: xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); michael@0: michael@0: unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); michael@0: expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: } michael@0: } michael@0: michael@0: src += 4; michael@0: dst += 4; michael@0: mask += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: uint32_t sa; michael@0: michael@0: s = *src++; michael@0: m = (uint32_t) *mask++; michael@0: d = *dst; michael@0: michael@0: sa = s >> 24; michael@0: michael@0: if (m) michael@0: { michael@0: if (sa == 0xff && m == 0xff) michael@0: { michael@0: *dst = s; michael@0: } michael@0: else michael@0: { michael@0: __m128i ms, md, ma, msa; michael@0: michael@0: ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); michael@0: ms = unpack_32_1x128 (s); michael@0: md = unpack_32_1x128 (d); michael@0: michael@0: msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); michael@0: michael@0: *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); michael@0: } michael@0: } michael@0: michael@0: dst++; michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t src; michael@0: uint32_t *dst_line, *dst; michael@0: __m128i xmm_src; michael@0: __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_dsta_hi, xmm_dsta_lo; michael@0: int dst_stride; michael@0: int32_t w; michael@0: michael@0: src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); michael@0: michael@0: if (src == 0) michael@0: return; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); michael@0: michael@0: xmm_src = expand_pixel_32_1x128 (src); michael@0: michael@0: while (height--) michael@0: { michael@0: dst = dst_line; michael@0: michael@0: dst_line += dst_stride; michael@0: w = width; michael@0: michael@0: while (w && (uintptr_t)dst & 15) michael@0: { michael@0: __m128i vd; michael@0: michael@0: vd = unpack_32_1x128 (*dst); michael@0: michael@0: *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), michael@0: xmm_src)); michael@0: w--; michael@0: dst++; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: __m128i tmp_lo, tmp_hi; michael@0: michael@0: xmm_dst = load_128_aligned ((__m128i*)dst); michael@0: michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi); michael@0: michael@0: tmp_lo = xmm_src; michael@0: tmp_hi = xmm_src; michael@0: michael@0: over_2x128 (&xmm_dst_lo, &xmm_dst_hi, michael@0: &xmm_dsta_lo, &xmm_dsta_hi, michael@0: &tmp_lo, &tmp_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi)); michael@0: michael@0: w -= 4; michael@0: dst += 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: __m128i vd; michael@0: michael@0: vd = unpack_32_1x128 (*dst); michael@0: michael@0: *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), michael@0: xmm_src)); michael@0: w--; michael@0: dst++; michael@0: } michael@0: michael@0: } michael@0: michael@0: } michael@0: michael@0: static void michael@0: sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, michael@0: pixman_composite_info_t *info) michael@0: { michael@0: PIXMAN_COMPOSITE_ARGS (info); michael@0: uint32_t *src, *src_line, s; michael@0: uint32_t *dst, *dst_line, d; michael@0: uint32_t *mask, *mask_line; michael@0: uint32_t m; michael@0: int src_stride, mask_stride, dst_stride; michael@0: int32_t w; michael@0: michael@0: __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; michael@0: __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; michael@0: michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); michael@0: PIXMAN_IMAGE_GET_LINE ( michael@0: src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); michael@0: michael@0: while (height--) michael@0: { michael@0: src = src_line; michael@0: src_line += src_stride; michael@0: dst = dst_line; michael@0: dst_line += dst_stride; michael@0: mask = mask_line; michael@0: mask_line += mask_stride; michael@0: michael@0: w = width; michael@0: michael@0: while (w && (uintptr_t)dst & 15) michael@0: { michael@0: uint32_t sa; michael@0: michael@0: s = *src++; michael@0: m = (*mask++) >> 24; michael@0: d = *dst; michael@0: michael@0: sa = s >> 24; michael@0: michael@0: if (m) michael@0: { michael@0: if (sa == 0xff && m == 0xff) michael@0: { michael@0: *dst = s; michael@0: } michael@0: else michael@0: { michael@0: __m128i ms, md, ma, msa; michael@0: michael@0: ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); michael@0: ms = unpack_32_1x128 (s); michael@0: md = unpack_32_1x128 (d); michael@0: michael@0: msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); michael@0: michael@0: *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); michael@0: } michael@0: } michael@0: michael@0: dst++; michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: xmm_mask = load_128_unaligned ((__m128i*)mask); michael@0: michael@0: if (!is_transparent (xmm_mask)) michael@0: { michael@0: xmm_src = load_128_unaligned ((__m128i*)src); michael@0: michael@0: if (is_opaque (xmm_mask) && is_opaque (xmm_src)) michael@0: { michael@0: save_128_aligned ((__m128i *)dst, xmm_src); michael@0: } michael@0: else michael@0: { michael@0: xmm_dst = load_128_aligned ((__m128i *)dst); michael@0: michael@0: unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); michael@0: expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: } michael@0: } michael@0: michael@0: src += 4; michael@0: dst += 4; michael@0: mask += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: uint32_t sa; michael@0: michael@0: s = *src++; michael@0: m = (*mask++) >> 24; michael@0: d = *dst; michael@0: michael@0: sa = s >> 24; michael@0: michael@0: if (m) michael@0: { michael@0: if (sa == 0xff && m == 0xff) michael@0: { michael@0: *dst = s; michael@0: } michael@0: else michael@0: { michael@0: __m128i ms, md, ma, msa; michael@0: michael@0: ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); michael@0: ms = unpack_32_1x128 (s); michael@0: md = unpack_32_1x128 (d); michael@0: michael@0: msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); michael@0: michael@0: *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); michael@0: } michael@0: } michael@0: michael@0: dst++; michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: } michael@0: michael@0: /* A variant of 'sse2_combine_over_u' with minor tweaks */ michael@0: static force_inline void michael@0: scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, michael@0: const uint32_t* ps, michael@0: int32_t w, michael@0: pixman_fixed_t vx, michael@0: pixman_fixed_t unit_x, michael@0: pixman_fixed_t src_width_fixed, michael@0: pixman_bool_t fully_transparent_src) michael@0: { michael@0: uint32_t s, d; michael@0: const uint32_t* pm = NULL; michael@0: michael@0: __m128i xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_alpha_lo, xmm_alpha_hi; michael@0: michael@0: if (fully_transparent_src) michael@0: return; michael@0: michael@0: /* Align dst on a 16-byte boundary */ michael@0: while (w && ((uintptr_t)pd & 15)) michael@0: { michael@0: d = *pd; michael@0: s = combine1 (ps + pixman_fixed_to_int (vx), pm); michael@0: vx += unit_x; michael@0: while (vx >= 0) michael@0: vx -= src_width_fixed; michael@0: michael@0: *pd++ = core_combine_over_u_pixel_sse2 (s, d); michael@0: if (pm) michael@0: pm++; michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: __m128i tmp; michael@0: uint32_t tmp1, tmp2, tmp3, tmp4; michael@0: michael@0: tmp1 = *(ps + pixman_fixed_to_int (vx)); michael@0: vx += unit_x; michael@0: while (vx >= 0) michael@0: vx -= src_width_fixed; michael@0: tmp2 = *(ps + pixman_fixed_to_int (vx)); michael@0: vx += unit_x; michael@0: while (vx >= 0) michael@0: vx -= src_width_fixed; michael@0: tmp3 = *(ps + pixman_fixed_to_int (vx)); michael@0: vx += unit_x; michael@0: while (vx >= 0) michael@0: vx -= src_width_fixed; michael@0: tmp4 = *(ps + pixman_fixed_to_int (vx)); michael@0: vx += unit_x; michael@0: while (vx >= 0) michael@0: vx -= src_width_fixed; michael@0: michael@0: tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); michael@0: michael@0: xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm); michael@0: michael@0: if (is_opaque (xmm_src_hi)) michael@0: { michael@0: save_128_aligned ((__m128i*)pd, xmm_src_hi); michael@0: } michael@0: else if (!is_zero (xmm_src_hi)) michael@0: { michael@0: xmm_dst_hi = load_128_aligned ((__m128i*) pd); michael@0: michael@0: unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: expand_alpha_2x128 ( michael@0: xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); michael@0: michael@0: over_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: /* rebuid the 4 pixel data and save*/ michael@0: save_128_aligned ((__m128i*)pd, michael@0: pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: } michael@0: michael@0: w -= 4; michael@0: pd += 4; michael@0: if (pm) michael@0: pm += 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: d = *pd; michael@0: s = combine1 (ps + pixman_fixed_to_int (vx), pm); michael@0: vx += unit_x; michael@0: while (vx >= 0) michael@0: vx -= src_width_fixed; michael@0: michael@0: *pd++ = core_combine_over_u_pixel_sse2 (s, d); michael@0: if (pm) michael@0: pm++; michael@0: michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER, michael@0: scaled_nearest_scanline_sse2_8888_8888_OVER, michael@0: uint32_t, uint32_t, COVER) michael@0: FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER, michael@0: scaled_nearest_scanline_sse2_8888_8888_OVER, michael@0: uint32_t, uint32_t, NONE) michael@0: FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER, michael@0: scaled_nearest_scanline_sse2_8888_8888_OVER, michael@0: uint32_t, uint32_t, PAD) michael@0: FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER, michael@0: scaled_nearest_scanline_sse2_8888_8888_OVER, michael@0: uint32_t, uint32_t, NORMAL) michael@0: michael@0: static force_inline void michael@0: scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, michael@0: uint32_t * dst, michael@0: const uint32_t * src, michael@0: int32_t w, michael@0: pixman_fixed_t vx, michael@0: pixman_fixed_t unit_x, michael@0: pixman_fixed_t src_width_fixed, michael@0: pixman_bool_t zero_src) michael@0: { michael@0: __m128i xmm_mask; michael@0: __m128i xmm_src, xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_alpha_lo, xmm_alpha_hi; michael@0: michael@0: if (zero_src || (*mask >> 24) == 0) michael@0: return; michael@0: michael@0: xmm_mask = create_mask_16_128 (*mask >> 24); michael@0: michael@0: while (w && (uintptr_t)dst & 15) michael@0: { michael@0: uint32_t s = *(src + pixman_fixed_to_int (vx)); michael@0: vx += unit_x; michael@0: while (vx >= 0) michael@0: vx -= src_width_fixed; michael@0: michael@0: if (s) michael@0: { michael@0: uint32_t d = *dst; michael@0: michael@0: __m128i ms = unpack_32_1x128 (s); michael@0: __m128i alpha = expand_alpha_1x128 (ms); michael@0: __m128i dest = xmm_mask; michael@0: __m128i alpha_dst = unpack_32_1x128 (d); michael@0: michael@0: *dst = pack_1x128_32 ( michael@0: in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); michael@0: } michael@0: dst++; michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: uint32_t tmp1, tmp2, tmp3, tmp4; michael@0: michael@0: tmp1 = *(src + pixman_fixed_to_int (vx)); michael@0: vx += unit_x; michael@0: while (vx >= 0) michael@0: vx -= src_width_fixed; michael@0: tmp2 = *(src + pixman_fixed_to_int (vx)); michael@0: vx += unit_x; michael@0: while (vx >= 0) michael@0: vx -= src_width_fixed; michael@0: tmp3 = *(src + pixman_fixed_to_int (vx)); michael@0: vx += unit_x; michael@0: while (vx >= 0) michael@0: vx -= src_width_fixed; michael@0: tmp4 = *(src + pixman_fixed_to_int (vx)); michael@0: vx += unit_x; michael@0: while (vx >= 0) michael@0: vx -= src_width_fixed; michael@0: michael@0: xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); michael@0: michael@0: if (!is_zero (xmm_src)) michael@0: { michael@0: xmm_dst = load_128_aligned ((__m128i*)dst); michael@0: michael@0: unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi); michael@0: michael@0: in_over_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi, michael@0: &xmm_mask, &xmm_mask, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ( michael@0: (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: } michael@0: michael@0: dst += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: uint32_t s = *(src + pixman_fixed_to_int (vx)); michael@0: vx += unit_x; michael@0: while (vx >= 0) michael@0: vx -= src_width_fixed; michael@0: michael@0: if (s) michael@0: { michael@0: uint32_t d = *dst; michael@0: michael@0: __m128i ms = unpack_32_1x128 (s); michael@0: __m128i alpha = expand_alpha_1x128 (ms); michael@0: __m128i mask = xmm_mask; michael@0: __m128i dest = unpack_32_1x128 (d); michael@0: michael@0: *dst = pack_1x128_32 ( michael@0: in_over_1x128 (&ms, &alpha, &mask, &dest)); michael@0: } michael@0: michael@0: dst++; michael@0: w--; michael@0: } michael@0: michael@0: } michael@0: michael@0: FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, michael@0: scaled_nearest_scanline_sse2_8888_n_8888_OVER, michael@0: uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE) michael@0: FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, michael@0: scaled_nearest_scanline_sse2_8888_n_8888_OVER, michael@0: uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE) michael@0: FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, michael@0: scaled_nearest_scanline_sse2_8888_n_8888_OVER, michael@0: uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) michael@0: FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, michael@0: scaled_nearest_scanline_sse2_8888_n_8888_OVER, michael@0: uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE) michael@0: michael@0: #define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1) michael@0: michael@0: #define BILINEAR_DECLARE_VARIABLES \ michael@0: const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \ michael@0: const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \ michael@0: const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\ michael@0: const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \ michael@0: const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\ michael@0: const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \ michael@0: const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x, \ michael@0: unit_x, unit_x, unit_x, unit_x); \ michael@0: const __m128i xmm_zero = _mm_setzero_si128 (); \ michael@0: __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx) michael@0: michael@0: #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \ michael@0: do { \ michael@0: __m128i xmm_wh, xmm_lo, xmm_hi, a; \ michael@0: /* fetch 2x2 pixel block into sse2 registers */ \ michael@0: __m128i tltr = _mm_loadl_epi64 ( \ michael@0: (__m128i *)&src_top[pixman_fixed_to_int (vx)]); \ michael@0: __m128i blbr = _mm_loadl_epi64 ( \ michael@0: (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]); \ michael@0: vx += unit_x; \ michael@0: /* vertical interpolation */ \ michael@0: a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), \ michael@0: xmm_wt), \ michael@0: _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), \ michael@0: xmm_wb)); \ michael@0: if (BILINEAR_INTERPOLATION_BITS < 8) \ michael@0: { \ michael@0: /* calculate horizontal weights */ \ michael@0: xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7, \ michael@0: _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \ michael@0: xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ michael@0: /* horizontal interpolation */ \ michael@0: a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \ michael@0: a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh); \ michael@0: } \ michael@0: else \ michael@0: { \ michael@0: /* calculate horizontal weights */ \ michael@0: xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8, \ michael@0: _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \ michael@0: xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ michael@0: /* horizontal interpolation */ \ michael@0: xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \ michael@0: xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \ michael@0: a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \ michael@0: _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \ michael@0: } \ michael@0: /* shift and pack the result */ \ michael@0: a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2); \ michael@0: a = _mm_packs_epi32 (a, a); \ michael@0: a = _mm_packus_epi16 (a, a); \ michael@0: pix = _mm_cvtsi128_si32 (a); \ michael@0: } while (0) michael@0: michael@0: #define BILINEAR_SKIP_ONE_PIXEL() \ michael@0: do { \ michael@0: vx += unit_x; \ michael@0: xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ michael@0: } while(0) michael@0: michael@0: static force_inline void michael@0: scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst, michael@0: const uint32_t * mask, michael@0: const uint32_t * src_top, michael@0: const uint32_t * src_bottom, michael@0: int32_t w, michael@0: int wt, michael@0: int wb, michael@0: pixman_fixed_t vx, michael@0: pixman_fixed_t unit_x, michael@0: pixman_fixed_t max_vx, michael@0: pixman_bool_t zero_src) michael@0: { michael@0: BILINEAR_DECLARE_VARIABLES; michael@0: uint32_t pix1, pix2, pix3, pix4; michael@0: michael@0: while ((w -= 4) >= 0) michael@0: { michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); michael@0: *dst++ = pix1; michael@0: *dst++ = pix2; michael@0: *dst++ = pix3; michael@0: *dst++ = pix4; michael@0: } michael@0: michael@0: if (w & 2) michael@0: { michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); michael@0: *dst++ = pix1; michael@0: *dst++ = pix2; michael@0: } michael@0: michael@0: if (w & 1) michael@0: { michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); michael@0: *dst = pix1; michael@0: } michael@0: michael@0: } michael@0: michael@0: /* Add extra NULL argument to the existing bilinear fast paths to indicate michael@0: * that we don't need two-pass processing */ michael@0: michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC, michael@0: scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: COVER, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC, michael@0: scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: PAD, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC, michael@0: scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: NONE, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC, michael@0: scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: NORMAL, FLAG_NONE) michael@0: michael@0: static force_inline void michael@0: scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst, michael@0: const uint32_t * mask, michael@0: const uint32_t * src_top, michael@0: const uint32_t * src_bottom, michael@0: int32_t w, michael@0: int wt, michael@0: int wb, michael@0: pixman_fixed_t vx, michael@0: pixman_fixed_t unit_x, michael@0: pixman_fixed_t max_vx, michael@0: pixman_bool_t zero_src) michael@0: { michael@0: BILINEAR_DECLARE_VARIABLES; michael@0: uint32_t pix1, pix2, pix3, pix4; michael@0: michael@0: while (w && ((uintptr_t)dst & 15)) michael@0: { michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); michael@0: michael@0: if (pix1) michael@0: { michael@0: pix2 = *dst; michael@0: *dst = core_combine_over_u_pixel_sse2 (pix1, pix2); michael@0: } michael@0: michael@0: w--; michael@0: dst++; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: __m128i xmm_src; michael@0: __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo; michael@0: __m128i xmm_alpha_hi, xmm_alpha_lo; michael@0: michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); michael@0: michael@0: xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); michael@0: michael@0: if (!is_zero (xmm_src)) michael@0: { michael@0: if (is_opaque (xmm_src)) michael@0: { michael@0: save_128_aligned ((__m128i *)dst, xmm_src); michael@0: } michael@0: else michael@0: { michael@0: __m128i xmm_dst = load_128_aligned ((__m128i *)dst); michael@0: michael@0: unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); michael@0: over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: } michael@0: } michael@0: michael@0: w -= 4; michael@0: dst += 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); michael@0: michael@0: if (pix1) michael@0: { michael@0: pix2 = *dst; michael@0: *dst = core_combine_over_u_pixel_sse2 (pix1, pix2); michael@0: } michael@0: michael@0: w--; michael@0: dst++; michael@0: } michael@0: } michael@0: michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER, michael@0: scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: COVER, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER, michael@0: scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: PAD, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER, michael@0: scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: NONE, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER, michael@0: scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: NORMAL, FLAG_NONE) michael@0: michael@0: michael@0: /* An example of SSE2 two-stage bilinear_over_8888_0565 fast path, which is implemented michael@0: as scaled_bilinear_scanline_sse2_8888_8888_SRC + op_bilinear_over_8888_0565 */ michael@0: michael@0: void op_bilinear_over_8888_0565(uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width) michael@0: { michael@0: /* Note: this is not really fast and should be based on 8 pixel loop from sse2_composite_over_8888_0565 */ michael@0: while (--width >= 0) michael@0: { michael@0: *dst = composite_over_8888_0565pixel (*src, *dst); michael@0: src++; michael@0: dst++; michael@0: } michael@0: } michael@0: michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_cover_OVER, michael@0: scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, michael@0: uint32_t, uint32_t, uint16_t, michael@0: COVER, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_pad_OVER, michael@0: scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, michael@0: uint32_t, uint32_t, uint16_t, michael@0: PAD, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_none_OVER, michael@0: scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, michael@0: uint32_t, uint32_t, uint16_t, michael@0: NONE, FLAG_NONE) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_normal_OVER, michael@0: scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, michael@0: uint32_t, uint32_t, uint16_t, michael@0: NORMAL, FLAG_NONE) michael@0: michael@0: /*****************************/ michael@0: michael@0: static force_inline void michael@0: scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst, michael@0: const uint8_t * mask, michael@0: const uint32_t * src_top, michael@0: const uint32_t * src_bottom, michael@0: int32_t w, michael@0: int wt, michael@0: int wb, michael@0: pixman_fixed_t vx, michael@0: pixman_fixed_t unit_x, michael@0: pixman_fixed_t max_vx, michael@0: pixman_bool_t zero_src) michael@0: { michael@0: BILINEAR_DECLARE_VARIABLES; michael@0: uint32_t pix1, pix2, pix3, pix4; michael@0: uint32_t m; michael@0: michael@0: while (w && ((uintptr_t)dst & 15)) michael@0: { michael@0: uint32_t sa; michael@0: michael@0: m = (uint32_t) *mask++; michael@0: michael@0: if (m) michael@0: { michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); michael@0: sa = pix1 >> 24; michael@0: michael@0: if (sa == 0xff && m == 0xff) michael@0: { michael@0: *dst = pix1; michael@0: } michael@0: else michael@0: { michael@0: __m128i ms, md, ma, msa; michael@0: michael@0: pix2 = *dst; michael@0: ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); michael@0: ms = unpack_32_1x128 (pix1); michael@0: md = unpack_32_1x128 (pix2); michael@0: michael@0: msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); michael@0: michael@0: *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); michael@0: } michael@0: } michael@0: else michael@0: { michael@0: BILINEAR_SKIP_ONE_PIXEL (); michael@0: } michael@0: michael@0: w--; michael@0: dst++; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; michael@0: __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; michael@0: michael@0: m = *(uint32_t*)mask; michael@0: michael@0: if (m) michael@0: { michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); michael@0: michael@0: xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); michael@0: michael@0: if (m == 0xffffffff && is_opaque (xmm_src)) michael@0: { michael@0: save_128_aligned ((__m128i *)dst, xmm_src); michael@0: } michael@0: else michael@0: { michael@0: xmm_dst = load_128_aligned ((__m128i *)dst); michael@0: michael@0: xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); michael@0: michael@0: unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); michael@0: expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); michael@0: michael@0: in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, michael@0: &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: } michael@0: } michael@0: else michael@0: { michael@0: BILINEAR_SKIP_ONE_PIXEL (); michael@0: BILINEAR_SKIP_ONE_PIXEL (); michael@0: BILINEAR_SKIP_ONE_PIXEL (); michael@0: BILINEAR_SKIP_ONE_PIXEL (); michael@0: } michael@0: michael@0: w -= 4; michael@0: dst += 4; michael@0: mask += 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: uint32_t sa; michael@0: michael@0: m = (uint32_t) *mask++; michael@0: michael@0: if (m) michael@0: { michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); michael@0: sa = pix1 >> 24; michael@0: michael@0: if (sa == 0xff && m == 0xff) michael@0: { michael@0: *dst = pix1; michael@0: } michael@0: else michael@0: { michael@0: __m128i ms, md, ma, msa; michael@0: michael@0: pix2 = *dst; michael@0: ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); michael@0: ms = unpack_32_1x128 (pix1); michael@0: md = unpack_32_1x128 (pix2); michael@0: michael@0: msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); michael@0: michael@0: *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); michael@0: } michael@0: } michael@0: else michael@0: { michael@0: BILINEAR_SKIP_ONE_PIXEL (); michael@0: } michael@0: michael@0: w--; michael@0: dst++; michael@0: } michael@0: } michael@0: michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER, michael@0: scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, michael@0: uint32_t, uint8_t, uint32_t, michael@0: COVER, FLAG_HAVE_NON_SOLID_MASK) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER, michael@0: scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, michael@0: uint32_t, uint8_t, uint32_t, michael@0: PAD, FLAG_HAVE_NON_SOLID_MASK) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER, michael@0: scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, michael@0: uint32_t, uint8_t, uint32_t, michael@0: NONE, FLAG_HAVE_NON_SOLID_MASK) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER, michael@0: scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, michael@0: uint32_t, uint8_t, uint32_t, michael@0: NORMAL, FLAG_HAVE_NON_SOLID_MASK) michael@0: michael@0: static force_inline void michael@0: scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst, michael@0: const uint32_t * mask, michael@0: const uint32_t * src_top, michael@0: const uint32_t * src_bottom, michael@0: int32_t w, michael@0: int wt, michael@0: int wb, michael@0: pixman_fixed_t vx, michael@0: pixman_fixed_t unit_x, michael@0: pixman_fixed_t max_vx, michael@0: pixman_bool_t zero_src) michael@0: { michael@0: BILINEAR_DECLARE_VARIABLES; michael@0: uint32_t pix1, pix2, pix3, pix4; michael@0: __m128i xmm_mask; michael@0: michael@0: if (zero_src || (*mask >> 24) == 0) michael@0: return; michael@0: michael@0: xmm_mask = create_mask_16_128 (*mask >> 24); michael@0: michael@0: while (w && ((uintptr_t)dst & 15)) michael@0: { michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); michael@0: if (pix1) michael@0: { michael@0: uint32_t d = *dst; michael@0: michael@0: __m128i ms = unpack_32_1x128 (pix1); michael@0: __m128i alpha = expand_alpha_1x128 (ms); michael@0: __m128i dest = xmm_mask; michael@0: __m128i alpha_dst = unpack_32_1x128 (d); michael@0: michael@0: *dst = pack_1x128_32 michael@0: (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); michael@0: } michael@0: michael@0: dst++; michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); michael@0: michael@0: if (pix1 | pix2 | pix3 | pix4) michael@0: { michael@0: __m128i xmm_src, xmm_src_lo, xmm_src_hi; michael@0: __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; michael@0: __m128i xmm_alpha_lo, xmm_alpha_hi; michael@0: michael@0: xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); michael@0: michael@0: xmm_dst = load_128_aligned ((__m128i*)dst); michael@0: michael@0: unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); michael@0: unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); michael@0: expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi); michael@0: michael@0: in_over_2x128 (&xmm_src_lo, &xmm_src_hi, michael@0: &xmm_alpha_lo, &xmm_alpha_hi, michael@0: &xmm_mask, &xmm_mask, michael@0: &xmm_dst_lo, &xmm_dst_hi); michael@0: michael@0: save_128_aligned michael@0: ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); michael@0: } michael@0: michael@0: dst += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); michael@0: if (pix1) michael@0: { michael@0: uint32_t d = *dst; michael@0: michael@0: __m128i ms = unpack_32_1x128 (pix1); michael@0: __m128i alpha = expand_alpha_1x128 (ms); michael@0: __m128i dest = xmm_mask; michael@0: __m128i alpha_dst = unpack_32_1x128 (d); michael@0: michael@0: *dst = pack_1x128_32 michael@0: (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); michael@0: } michael@0: michael@0: dst++; michael@0: w--; michael@0: } michael@0: } michael@0: michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, michael@0: scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: COVER, FLAG_HAVE_SOLID_MASK) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, michael@0: scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: PAD, FLAG_HAVE_SOLID_MASK) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, michael@0: scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: NONE, FLAG_HAVE_SOLID_MASK) michael@0: FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, michael@0: scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL, michael@0: uint32_t, uint32_t, uint32_t, michael@0: NORMAL, FLAG_HAVE_SOLID_MASK) michael@0: michael@0: static const pixman_fast_path_t sse2_fast_paths[] = michael@0: { michael@0: /* PIXMAN_OP_OVER */ michael@0: PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565), michael@0: PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565), michael@0: PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565), michael@0: PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565), michael@0: PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565), michael@0: PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565), michael@0: PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888), michael@0: PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca), michael@0: PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca), michael@0: PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca), michael@0: PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca), michael@0: PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca), michael@0: PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca), michael@0: PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565), michael@0: PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565), michael@0: PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), michael@0: PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), michael@0: michael@0: /* PIXMAN_OP_OVER_REVERSE */ michael@0: PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888), michael@0: PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888), michael@0: michael@0: /* PIXMAN_OP_ADD */ michael@0: PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca), michael@0: PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8), michael@0: PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888), michael@0: PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888), michael@0: PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8), michael@0: PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8), michael@0: PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888), michael@0: PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888), michael@0: PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888), michael@0: PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888), michael@0: PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888), michael@0: PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888), michael@0: PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888), michael@0: PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888), michael@0: michael@0: /* PIXMAN_OP_SRC */ michael@0: PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888), michael@0: PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888), michael@0: PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888), michael@0: PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888), michael@0: PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565), michael@0: PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565), michael@0: PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565), michael@0: PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565), michael@0: PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888), michael@0: PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888), michael@0: PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area), michael@0: PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area), michael@0: PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), michael@0: PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), michael@0: PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), michael@0: PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), michael@0: PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area), michael@0: PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area), michael@0: michael@0: /* PIXMAN_OP_IN */ michael@0: PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8), michael@0: PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8), michael@0: PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8), michael@0: michael@0: SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), michael@0: SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), michael@0: SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), michael@0: SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), michael@0: SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), michael@0: SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), michael@0: SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), michael@0: SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), michael@0: SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), michael@0: SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), michael@0: SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), michael@0: SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), michael@0: SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), michael@0: SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), michael@0: SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), michael@0: SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), michael@0: michael@0: SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), michael@0: SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), michael@0: SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), michael@0: SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), michael@0: SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), michael@0: SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), michael@0: SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), michael@0: SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), michael@0: michael@0: SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888), michael@0: SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888), michael@0: SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888), michael@0: SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888), michael@0: SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888), michael@0: SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888), michael@0: michael@0: SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), michael@0: SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), michael@0: SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), michael@0: SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), michael@0: michael@0: SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), michael@0: SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), michael@0: SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), michael@0: SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), michael@0: michael@0: SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888), michael@0: SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888), michael@0: SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888), michael@0: SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888), michael@0: michael@0: /* and here the needed entries are added to the fast path table */ michael@0: michael@0: SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, sse2_8888_0565), michael@0: SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, b5g6r5, sse2_8888_0565), michael@0: michael@0: { PIXMAN_OP_NONE }, michael@0: }; michael@0: michael@0: static uint32_t * michael@0: sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) michael@0: { michael@0: int w = iter->width; michael@0: __m128i ff000000 = mask_ff000000; michael@0: uint32_t *dst = iter->buffer; michael@0: uint32_t *src = (uint32_t *)iter->bits; michael@0: michael@0: iter->bits += iter->stride; michael@0: michael@0: while (w && ((uintptr_t)dst) & 0x0f) michael@0: { michael@0: *dst++ = (*src++) | 0xff000000; michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 4) michael@0: { michael@0: save_128_aligned ( michael@0: (__m128i *)dst, _mm_or_si128 ( michael@0: load_128_unaligned ((__m128i *)src), ff000000)); michael@0: michael@0: dst += 4; michael@0: src += 4; michael@0: w -= 4; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: *dst++ = (*src++) | 0xff000000; michael@0: w--; michael@0: } michael@0: michael@0: return iter->buffer; michael@0: } michael@0: michael@0: static uint32_t * michael@0: sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) michael@0: { michael@0: int w = iter->width; michael@0: uint32_t *dst = iter->buffer; michael@0: uint16_t *src = (uint16_t *)iter->bits; michael@0: __m128i ff000000 = mask_ff000000; michael@0: michael@0: iter->bits += iter->stride; michael@0: michael@0: while (w && ((uintptr_t)dst) & 0x0f) michael@0: { michael@0: uint16_t s = *src++; michael@0: michael@0: *dst++ = convert_0565_to_8888 (s); michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 8) michael@0: { michael@0: __m128i lo, hi, s; michael@0: michael@0: s = _mm_loadu_si128 ((__m128i *)src); michael@0: michael@0: lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ())); michael@0: hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ())); michael@0: michael@0: save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000)); michael@0: save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000)); michael@0: michael@0: dst += 8; michael@0: src += 8; michael@0: w -= 8; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: uint16_t s = *src++; michael@0: michael@0: *dst++ = convert_0565_to_8888 (s); michael@0: w--; michael@0: } michael@0: michael@0: return iter->buffer; michael@0: } michael@0: michael@0: static uint32_t * michael@0: sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) michael@0: { michael@0: int w = iter->width; michael@0: uint32_t *dst = iter->buffer; michael@0: uint8_t *src = iter->bits; michael@0: __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; michael@0: michael@0: iter->bits += iter->stride; michael@0: michael@0: while (w && (((uintptr_t)dst) & 15)) michael@0: { michael@0: *dst++ = *(src++) << 24; michael@0: w--; michael@0: } michael@0: michael@0: while (w >= 16) michael@0: { michael@0: xmm0 = _mm_loadu_si128((__m128i *)src); michael@0: michael@0: xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0); michael@0: xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0); michael@0: xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1); michael@0: xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1); michael@0: xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2); michael@0: xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2); michael@0: michael@0: _mm_store_si128(((__m128i *)(dst + 0)), xmm3); michael@0: _mm_store_si128(((__m128i *)(dst + 4)), xmm4); michael@0: _mm_store_si128(((__m128i *)(dst + 8)), xmm5); michael@0: _mm_store_si128(((__m128i *)(dst + 12)), xmm6); michael@0: michael@0: dst += 16; michael@0: src += 16; michael@0: w -= 16; michael@0: } michael@0: michael@0: while (w) michael@0: { michael@0: *dst++ = *(src++) << 24; michael@0: w--; michael@0: } michael@0: michael@0: return iter->buffer; michael@0: } michael@0: michael@0: typedef struct michael@0: { michael@0: pixman_format_code_t format; michael@0: pixman_iter_get_scanline_t get_scanline; michael@0: } fetcher_info_t; michael@0: michael@0: static const fetcher_info_t fetchers[] = michael@0: { michael@0: { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 }, michael@0: { PIXMAN_r5g6b5, sse2_fetch_r5g6b5 }, michael@0: { PIXMAN_a8, sse2_fetch_a8 }, michael@0: { PIXMAN_null } michael@0: }; michael@0: michael@0: static pixman_bool_t michael@0: sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) michael@0: { michael@0: pixman_image_t *image = iter->image; michael@0: michael@0: #define FLAGS \ michael@0: (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \ michael@0: FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) michael@0: michael@0: if ((iter->iter_flags & ITER_NARROW) && michael@0: (iter->image_flags & FLAGS) == FLAGS) michael@0: { michael@0: const fetcher_info_t *f; michael@0: michael@0: for (f = &fetchers[0]; f->format != PIXMAN_null; f++) michael@0: { michael@0: if (image->common.extended_format_code == f->format) michael@0: { michael@0: uint8_t *b = (uint8_t *)image->bits.bits; michael@0: int s = image->bits.rowstride * 4; michael@0: michael@0: iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8; michael@0: iter->stride = s; michael@0: michael@0: iter->get_scanline = f->get_scanline; michael@0: return TRUE; michael@0: } michael@0: } michael@0: } michael@0: michael@0: return FALSE; michael@0: } michael@0: michael@0: #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) michael@0: __attribute__((__force_align_arg_pointer__)) michael@0: #endif michael@0: pixman_implementation_t * michael@0: _pixman_implementation_create_sse2 (pixman_implementation_t *fallback) michael@0: { michael@0: pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths); michael@0: michael@0: /* SSE2 constants */ michael@0: mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000); michael@0: mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000); michael@0: mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0); michael@0: mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f); michael@0: mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000); michael@0: mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00); michael@0: mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8); michael@0: mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0); michael@0: mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000); michael@0: mask_0080 = create_mask_16_128 (0x0080); michael@0: mask_00ff = create_mask_16_128 (0x00ff); michael@0: mask_0101 = create_mask_16_128 (0x0101); michael@0: mask_ffff = create_mask_16_128 (0xffff); michael@0: mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000); michael@0: mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000); michael@0: mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8); michael@0: mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004); michael@0: michael@0: /* Set up function pointers */ michael@0: imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u; michael@0: imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u; michael@0: imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u; michael@0: imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u; michael@0: imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u; michael@0: imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u; michael@0: imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u; michael@0: imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u; michael@0: imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u; michael@0: imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u; michael@0: michael@0: imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u; michael@0: michael@0: imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca; michael@0: imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca; michael@0: imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca; michael@0: imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca; michael@0: imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca; michael@0: imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca; michael@0: imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca; michael@0: imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca; michael@0: imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca; michael@0: imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca; michael@0: imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca; michael@0: michael@0: imp->blt = sse2_blt; michael@0: imp->fill = sse2_fill; michael@0: michael@0: imp->src_iter_init = sse2_src_iter_init; michael@0: michael@0: return imp; michael@0: }