Wed, 31 Dec 2014 06:09:35 +0100
Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purposes.

/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission. Red Hat makes no representations about the
 * suitability of this software for any purpose. It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"

static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

static __m128i mask_565_rb;
static __m128i mask_565_pack_multiplier;
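
/* These masks hold per-lane constant patterns (e.g. mask_00ff is 0x00ff in
 * every 16-bit lane, mask_0080 is 0x0080); they are presumably initialized
 * once when the SSE2 implementation is set up, further down in this file
 * (not shown in this excerpt).
 */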

static force_inline __m128i
unpack_32_1x128 (uint32_t data)
{
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
}

static force_inline void
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
{
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
}

static force_inline __m128i
unpack_565_to_8888 (__m128i lo)
{
    __m128i r, g, b, rb, t;

    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);

    rb = _mm_or_si128 (r, b);
    t = _mm_and_si128 (rb, mask_565_fix_rb);
    t = _mm_srli_epi32 (t, 5);
    rb = _mm_or_si128 (rb, t);

    t = _mm_and_si128 (g, mask_565_fix_g);
    t = _mm_srli_epi32 (t, 6);
    g = _mm_or_si128 (g, t);

    return _mm_or_si128 (rb, g);
}
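/* Each 565 channel is shifted into the top bits of its 8-bit slot above;
 * OR-ing back a copy of the channel's high bits (the mask_565_fix_rb /
 * mask_565_fix_g steps) replicates them into the low bits, so e.g. 0x1f
 * expands to 0xff rather than 0xf8 and the full 0..255 range is covered.
 */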

static force_inline void
unpack_565_128_4x128 (__m128i data,
                      __m128i* data0,
                      __m128i* data1,
                      __m128i* data2,
                      __m128i* data3)
{
    __m128i lo, hi;

    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());

    lo = unpack_565_to_8888 (lo);
    hi = unpack_565_to_8888 (hi);

    unpack_128_2x128 (lo, data0, data1);
    unpack_128_2x128 (hi, data2, data3);
}

static force_inline uint16_t
pack_565_32_16 (uint32_t pixel)
{
    return (uint16_t) (((pixel >> 8) & 0xf800) |
                       ((pixel >> 5) & 0x07e0) |
                       ((pixel >> 3) & 0x001f));
}
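/* Worked example: 0x00ff8040 (R = 0xff, G = 0x80, B = 0x40) gives
 * ((pixel >> 8) & 0xf800) = 0xf800, ((pixel >> 5) & 0x07e0) = 0x0400 and
 * ((pixel >> 3) & 0x001f) = 0x0008, i.e. the r5g6b5 value 0xfc08.
 */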

static force_inline __m128i
pack_2x128_128 (__m128i lo, __m128i hi)
{
    return _mm_packus_epi16 (lo, hi);
}

static force_inline __m128i
pack_565_2packedx128_128 (__m128i lo, __m128i hi)
{
    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);

    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);

    __m128i g0 = _mm_and_si128 (lo, mask_green);
    __m128i g1 = _mm_and_si128 (hi, mask_green);

    t0 = _mm_or_si128 (t0, g0);
    t1 = _mm_or_si128 (t1, g1);

    /* Simulates _mm_packus_epi32 */
    t0 = _mm_slli_epi32 (t0, 16 - 5);
    t1 = _mm_slli_epi32 (t1, 16 - 5);
    t0 = _mm_srai_epi32 (t0, 16);
    t1 = _mm_srai_epi32 (t1, 16);
    return _mm_packs_epi32 (t0, t1);
}
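/* The _mm_madd_epi16 step relies on mask_565_rb isolating the top five bits
 * of red and blue in the two 16-bit halves of each pixel; the multiplier
 * presumably holds a power of two per half, so the multiply-and-horizontal-add
 * shifts both channels into their 565 positions in one instruction.  SSE2 has
 * no unsigned 32->16 pack (_mm_packus_epi32 only arrived with SSE4.1), hence
 * the shift / arithmetic-shift / _mm_packs_epi32 sequence above.
 */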

static force_inline __m128i
pack_565_2x128_128 (__m128i lo, __m128i hi)
{
    __m128i data;
    __m128i r, g1, g2, b;

    data = pack_2x128_128 (lo, hi);

    r = _mm_and_si128 (data, mask_565_r);
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
    b = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);

    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
}

static force_inline __m128i
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
{
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
                             pack_565_2x128_128 (*xmm2, *xmm3));
}

static force_inline int
is_opaque (__m128i x)
{
    __m128i ffs = _mm_cmpeq_epi8 (x, x);

    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
}

static force_inline int
is_zero (__m128i x)
{
    return _mm_movemask_epi8 (
        _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
}

static force_inline int
is_transparent (__m128i x)
{
    return (_mm_movemask_epi8 (
                _mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
}
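/* _mm_movemask_epi8 collects the top bit of each of the 16 bytes.  With
 * 8888 pixels the alpha byte sits at indices 3, 7, 11 and 15, so the 0x8888
 * mask selects exactly the four alpha compare results: is_opaque tests that
 * all four alphas are 0xff, is_transparent that they are all 0x00, while
 * is_zero requires every byte of the register to be zero.
 */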

static force_inline __m128i
expand_pixel_32_1x128 (uint32_t data)
{
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
}

static force_inline __m128i
expand_alpha_1x128 (__m128i data)
{
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
                                                     _MM_SHUFFLE (3, 3, 3, 3)),
                                _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_2x128 (__m128i data_lo,
                    __m128i data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));

    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline void
expand_alpha_rev_2x128 (__m128i data_lo,
                        __m128i data_hi,
                        __m128i* alpha_lo,
                        __m128i* alpha_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline void
pix_multiply_2x128 (__m128i* data_lo,
                    __m128i* data_hi,
                    __m128i* alpha_lo,
                    __m128i* alpha_hi,
                    __m128i* ret_lo,
                    __m128i* ret_hi)
{
    __m128i lo, hi;

    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
    lo = _mm_adds_epu16 (lo, mask_0080);
    hi = _mm_adds_epu16 (hi, mask_0080);
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
}
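/* Per 16-bit channel this is the classic exact divide-by-255 with rounding:
 * with t = data * alpha, the result is ((t + 0x80) * 0x0101) >> 16, which
 * equals (t + 0x80 + ((t + 0x80) >> 8)) >> 8, i.e. t / 255 rounded to
 * nearest.  mask_0080 supplies the bias and the _mm_mulhi_epu16 by 0x0101
 * performs the "* 257 >> 16" step.
 */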

static force_inline void
pix_add_multiply_2x128 (__m128i* src_lo,
                        __m128i* src_hi,
                        __m128i* alpha_dst_lo,
                        __m128i* alpha_dst_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi,
                        __m128i* alpha_src_lo,
                        __m128i* alpha_src_hi,
                        __m128i* ret_lo,
                        __m128i* ret_hi)
{
    __m128i t1_lo, t1_hi;
    __m128i t2_lo, t2_hi;

    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi,
                        &t1_lo, &t1_hi);
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi,
                        &t2_lo, &t2_hi);

    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
}

static force_inline void
negate_2x128 (__m128i data_lo,
              __m128i data_hi,
              __m128i* neg_lo,
              __m128i* neg_hi)
{
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
}

static force_inline void
invert_colors_2x128 (__m128i data_lo,
                     __m128i data_hi,
                     __m128i* inv_lo,
                     __m128i* inv_hi)
{
    __m128i lo, hi;

    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline void
over_2x128 (__m128i* src_lo,
            __m128i* src_hi,
            __m128i* alpha_lo,
            __m128i* alpha_hi,
            __m128i* dst_lo,
            __m128i* dst_hi)
{
    __m128i t1, t2;

    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);

    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);

    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
}
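/* This is the premultiplied Porter-Duff OVER operator,
 * dst = src + dst * (1 - alpha), with the (1 - alpha) computed by
 * negate_2x128, the product by pix_multiply_2x128, and the final add done
 * with byte saturation.
 */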

static force_inline void
over_rev_non_pre_2x128 (__m128i src_lo,
                        __m128i src_hi,
                        __m128i* dst_lo,
                        __m128i* dst_hi)
{
    __m128i lo, hi;
    __m128i alpha_lo, alpha_hi;

    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);

    lo = _mm_or_si128 (alpha_lo, mask_alpha);
    hi = _mm_or_si128 (alpha_hi, mask_alpha);

    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);

    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);

    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
}
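/* "rev non-pre" handles sources that are non-premultiplied and stored with
 * red and blue swapped: invert_colors_2x128 restores the channel order,
 * mask_alpha presumably forces the alpha lane of the expanded alpha to 0xff
 * so that the premultiplying step leaves alpha itself unchanged, and the
 * result is then composited with the normal OVER.
 */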

static force_inline void
in_over_2x128 (__m128i* src_lo,
               __m128i* src_hi,
               __m128i* alpha_lo,
               __m128i* alpha_hi,
               __m128i* mask_lo,
               __m128i* mask_hi,
               __m128i* dst_lo,
               __m128i* dst_hi)
{
    __m128i s_lo, s_hi;
    __m128i a_lo, a_hi;

    pix_multiply_2x128 (src_lo, src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);

    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
}

/* load 4 pixels from a 16-byte-aligned address */
static force_inline __m128i
load_128_aligned (__m128i* src)
{
    return _mm_load_si128 (src);
}

/* load 4 pixels from an unaligned address */
static force_inline __m128i
load_128_unaligned (const __m128i* src)
{
    return _mm_loadu_si128 (src);
}

/* save 4 pixels using Write Combining memory on a 16-byte-aligned address */
static force_inline void
save_128_write_combining (__m128i* dst,
                          __m128i data)
{
    _mm_stream_si128 (dst, data);
}

/* save 4 pixels to a 16-byte-aligned address */
static force_inline void
save_128_aligned (__m128i* dst,
                  __m128i data)
{
    _mm_store_si128 (dst, data);
}

/* save 4 pixels to an unaligned address */
static force_inline void
save_128_unaligned (__m128i* dst,
                    __m128i data)
{
    _mm_storeu_si128 (dst, data);
}

static force_inline __m128i
load_32_1x128 (uint32_t data)
{
    return _mm_cvtsi32_si128 (data);
}

static force_inline __m128i
expand_alpha_rev_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
expand_pixel_8_1x128 (uint8_t data)
{
    return _mm_shufflelo_epi16 (
        unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m128i
pix_multiply_1x128 (__m128i data,
                    __m128i alpha)
{
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
                                            mask_0080),
                            mask_0101);
}

static force_inline __m128i
pix_add_multiply_1x128 (__m128i* src,
                        __m128i* alpha_dst,
                        __m128i* dst,
                        __m128i* alpha_src)
{
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);

    return _mm_adds_epu8 (t1, t2);
}

static force_inline __m128i
negate_1x128 (__m128i data)
{
    return _mm_xor_si128 (data, mask_00ff);
}

static force_inline __m128i
invert_colors_1x128 (__m128i data)
{
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m128i
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
{
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
}

static force_inline __m128i
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
{
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
                       pix_multiply_1x128 (*alpha, *mask),
                       *dst);
}

static force_inline __m128i
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
{
    __m128i alpha = expand_alpha_1x128 (src);

    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
                                           _mm_or_si128 (alpha, mask_alpha)),
                       alpha,
                       dst);
}

static force_inline uint32_t
pack_1x128_32 (__m128i data)
{
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
}

static force_inline __m128i
expand565_16_1x128 (uint16_t pixel)
{
    __m128i m = _mm_cvtsi32_si128 (pixel);

    m = unpack_565_to_8888 (m);

    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
}

static force_inline uint32_t
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint8_t a;
    __m128i xmms;

    a = src >> 24;

    if (a == 0xff)
    {
        return src;
    }
    else if (src)
    {
        xmms = unpack_32_1x128 (src);
        return pack_1x128_32 (
            over_1x128 (xmms, expand_alpha_1x128 (xmms),
                        unpack_32_1x128 (dst)));
    }

    return dst;
}
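/* Single-pixel OVER with the two cheap short circuits: an opaque source
 * (alpha == 0xff) replaces the destination outright and an all-zero source
 * leaves it untouched, so only the remaining pixels pay for the SIMD blend.
 */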

static force_inline uint32_t
combine1 (const uint32_t *ps, const uint32_t *pm)
{
    uint32_t s = *ps;

    if (pm)
    {
        __m128i ms, mm;

        mm = unpack_32_1x128 (*pm);
        mm = expand_alpha_1x128 (mm);

        ms = unpack_32_1x128 (s);
        ms = pix_multiply_1x128 (ms, mm);

        s = pack_1x128_32 (ms);
    }

    return s;
}
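/* combine1 above and combine4 below implement the "unified" combiner
 * convention: when a mask pointer is supplied, every source pixel is first
 * multiplied by the corresponding mask pixel's alpha, so the operator loops
 * never have to look at pm again once the source has been fetched.
 */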

static force_inline __m128i
combine4 (const __m128i *ps, const __m128i *pm)
{
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_msk_lo, xmm_msk_hi;
    __m128i s;

    if (pm)
    {
        xmm_msk_lo = load_128_unaligned (pm);

        if (is_transparent (xmm_msk_lo))
            return _mm_setzero_si128 ();
    }

    s = load_128_unaligned (ps);

    if (pm)
    {
        unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);

        expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_msk_lo, &xmm_msk_hi,
                            &xmm_src_lo, &xmm_src_hi);

        s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
    }

    return s;
}

static force_inline void
core_combine_over_u_sse2_mask (uint32_t * pd,
                               const uint32_t* ps,
                               const uint32_t* pm,
                               int w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;
        w--;
    }

    while (w >= 4)
    {
        __m128i mask = load_128_unaligned ((__m128i *)pm);

        if (!is_zero (mask))
        {
            __m128i src;
            __m128i src_hi, src_lo;
            __m128i mask_hi, mask_lo;
            __m128i alpha_hi, alpha_lo;

            src = load_128_unaligned ((__m128i *)ps);

            if (is_opaque (_mm_and_si128 (src, mask)))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);
                __m128i dst_hi, dst_lo;

                unpack_128_2x128 (mask, &mask_lo, &mask_hi);
                unpack_128_2x128 (src, &src_lo, &src_hi);

                expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
                pix_multiply_2x128 (&src_lo, &src_hi,
                                    &mask_lo, &mask_hi,
                                    &src_lo, &src_hi);

                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);

                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        pm += 4;
        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        pm++;

        w--;
    }
}
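/* This is the loop shape shared by the combiners in this file: a scalar
 * head that advances pd to a 16-byte boundary, a four-pixel SSE2 body that
 * can therefore use aligned loads and stores on the destination, and a
 * scalar tail for whatever width is left over.
 */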

static force_inline void
core_combine_over_u_sse2_no_mask (uint32_t * pd,
                                  const uint32_t* ps,
                                  int w)
{
    uint32_t s, d;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i src;
        __m128i src_hi, src_lo, dst_hi, dst_lo;
        __m128i alpha_hi, alpha_lo;

        src = load_128_unaligned ((__m128i *)ps);

        if (!is_zero (src))
        {
            if (is_opaque (src))
            {
                save_128_aligned ((__m128i *)pd, src);
            }
            else
            {
                __m128i dst = load_128_aligned ((__m128i *)pd);

                unpack_128_2x128 (src, &src_lo, &src_hi);
                unpack_128_2x128 (dst, &dst_lo, &dst_hi);

                expand_alpha_2x128 (src_lo, src_hi,
                                    &alpha_lo, &alpha_hi);
                over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
                            &dst_lo, &dst_hi);

                save_128_aligned (
                    (__m128i *)pd,
                    pack_2x128_128 (dst_lo, dst_hi));
            }
        }

        ps += 4;
        pd += 4;
        w -= 4;
    }
    while (w)
    {
        d = *pd;
        s = *ps;

        if (s)
            *pd = core_combine_over_u_pixel_sse2 (s, d);
        pd++;
        ps++;

        w--;
    }
}

static force_inline void
sse2_combine_over_u (pixman_implementation_t *imp,
                     pixman_op_t op,
                     uint32_t * pd,
                     const uint32_t * ps,
                     const uint32_t * pm,
                     int w)
{
    if (pm)
        core_combine_over_u_sse2_mask (pd, ps, pm, w);
    else
        core_combine_over_u_sse2_no_mask (pd, ps, w);
}

static void
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t op,
                             uint32_t * pd,
                             const uint32_t * ps,
                             const uint32_t * pm,
                             int w)
{
    uint32_t s, d;

    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_alpha_lo, xmm_alpha_hi;

    /* Align dst on a 16-byte boundary */
    while (w && ((uintptr_t)pd & 15))
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        /* I'm loading unaligned because I'm not sure
         * about the address alignment.
         */
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_lo, &xmm_alpha_hi);

        over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                    &xmm_alpha_lo, &xmm_alpha_hi,
                    &xmm_src_lo, &xmm_src_hi);

        /* rebuild the 4-pixel data and save */
        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_src_lo, xmm_src_hi));

        w -= 4;
        ps += 4;
        pd += 4;

        if (pm)
            pm += 4;
    }

    while (w)
    {
        d = *pd;
        s = combine1 (ps, pm);

        *pd++ = core_combine_over_u_pixel_sse2 (d, s);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
{
    uint32_t maska = src >> 24;

    if (maska == 0)
    {
        return 0;
    }
    else if (maska != 0xff)
    {
        return pack_1x128_32 (
            pix_multiply_1x128 (unpack_32_1x128 (dst),
                                expand_alpha_1x128 (unpack_32_1x128 (src))));
    }

    return dst;
}
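/* Returns dst scaled by src's alpha, with short circuits for alpha 0 and
 * 0xff.  The callers pick the argument order: sse2_combine_in_u passes
 * (d, s) so the stored value is src * dst.alpha (IN), while
 * sse2_combine_in_reverse_u passes (s, d) for dst * src.alpha (IN_REVERSE).
 */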

static void
sse2_combine_in_u (pixman_implementation_t *imp,
                   pixman_op_t op,
                   uint32_t * pd,
                   const uint32_t * ps,
                   const uint32_t * pm,
                   int w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned ((__m128i*)pd,
                          pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (d, s);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t op,
                           uint32_t * pd,
                           const uint32_t * ps,
                           const uint32_t * pm,
                           int w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);
        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_in_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static void
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t op,
                            uint32_t * pd,
                            const uint32_t * ps,
                            const uint32_t * pm,
                            int w)
{
    while (w && ((uintptr_t)pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));

        if (pm)
            pm++;
        ps++;
        w--;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        negate_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);

        pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
                            &xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        if (pm)
            pm += 4;

        w -= 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (d), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (s)))));
        ps++;
        if (pm)
            pm++;
        w--;
    }
}

static void
sse2_combine_out_u (pixman_implementation_t *imp,
                    pixman_op_t op,
                    uint32_t * pd,
                    const uint32_t * ps,
                    const uint32_t * pm,
                    int w)
{
    while (w && ((uintptr_t)pd & 15))
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        __m128i xmm_src_lo, xmm_src_hi;
        __m128i xmm_dst_lo, xmm_dst_hi;

        xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
        negate_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
                            &xmm_dst_lo, &xmm_dst_hi,
                            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        uint32_t s = combine1 (ps, pm);
        uint32_t d = *pd;

        *pd++ = pack_1x128_32 (
            pix_multiply_1x128 (
                unpack_32_1x128 (s), negate_1x128 (
                    expand_alpha_1x128 (unpack_32_1x128 (d)))));
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_atop_u_pixel_sse2 (uint32_t src,
                                uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
    __m128i da = expand_alpha_1x128 (d);

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}
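/* ATOP: dst = src * dst.alpha + dst * (1 - src.alpha), i.e. the source is
 * drawn only where the destination already is; pix_add_multiply_1x128 does
 * both products and the saturating add in one call.
 */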

static void
sse2_combine_atop_u (pixman_implementation_t *imp,
                     pixman_op_t op,
                     uint32_t * pd,
                     const uint32_t * ps,
                     const uint32_t * pm,
                     int w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_atop_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
                                        uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i sa = expand_alpha_1x128 (s);
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
}

static void
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
                             pixman_op_t op,
                             uint32_t * pd,
                             const uint32_t * ps,
                             const uint32_t * pm,
                             int w)
{
    uint32_t s, d;

    __m128i xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
        xmm_dst_hi = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
        ps++;
        w--;
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_xor_u_pixel_sse2 (uint32_t src,
                               uint32_t dst)
{
    __m128i s = unpack_32_1x128 (src);
    __m128i d = unpack_32_1x128 (dst);

    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));

    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
}
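/* XOR: dst = src * (1 - dst.alpha) + dst * (1 - src.alpha); each operand
 * survives only where the other is absent, hence both alphas are negated
 * before the add-multiply.
 */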

static void
sse2_combine_xor_u (pixman_implementation_t *imp,
                    pixman_op_t op,
                    uint32_t * dst,
                    const uint32_t * src,
                    const uint32_t * mask,
                    int width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;

    while (w && ((uintptr_t)pd & 15))
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }

    while (w >= 4)
    {
        xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
        xmm_dst = load_128_aligned ((__m128i*) pd);

        unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
        unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);

        expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
                            &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
                            &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
                      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
        negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
                      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);

        pix_add_multiply_2x128 (
            &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
            &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
            &xmm_dst_lo, &xmm_dst_hi);

        save_128_aligned (
            (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));

        ps += 4;
        pd += 4;
        w -= 4;
        if (pm)
            pm += 4;
    }

    while (w)
    {
        s = combine1 (ps, pm);
        d = *pd;

        *pd++ = core_combine_xor_u_pixel_sse2 (s, d);
        w--;
        ps++;
        if (pm)
            pm++;
    }
}

static force_inline void
sse2_combine_add_u (pixman_implementation_t *imp,
                    pixman_op_t op,
                    uint32_t * dst,
                    const uint32_t * src,
                    const uint32_t * mask,
                    int width)
{
    int w = width;
    uint32_t s, d;
    uint32_t* pd = dst;
    const uint32_t* ps = src;
    const uint32_t* pm = mask;

    while (w && (uintptr_t)pd & 15)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        if (pm)
            pm++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        w--;
    }

    while (w >= 4)
    {
        __m128i s;

        s = combine4 ((__m128i*)ps, (__m128i*)pm);

        save_128_aligned (
            (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned ((__m128i*)pd)));

        pd += 4;
        ps += 4;
        if (pm)
            pm += 4;
        w -= 4;
    }

    while (w--)
    {
        s = combine1 (ps, pm);
        d = *pd;

        ps++;
        *pd++ = _mm_cvtsi128_si32 (
            _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
        if (pm)
            pm++;
    }
}

static force_inline uint32_t
core_combine_saturate_u_pixel_sse2 (uint32_t src,
                                    uint32_t dst)
{
    __m128i ms = unpack_32_1x128 (src);
    __m128i md = unpack_32_1x128 (dst);
    uint32_t sa = src >> 24;
    uint32_t da = ~dst >> 24;

    if (sa > da)
    {
        ms = pix_multiply_1x128 (
            ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
    }

    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
}
michael@0 | 1419 | |
michael@0 | 1420 | static void |
michael@0 | 1421 | sse2_combine_saturate_u (pixman_implementation_t *imp, |
michael@0 | 1422 | pixman_op_t op, |
michael@0 | 1423 | uint32_t * pd, |
michael@0 | 1424 | const uint32_t * ps, |
michael@0 | 1425 | const uint32_t * pm, |
michael@0 | 1426 | int w) |
michael@0 | 1427 | { |
michael@0 | 1428 | uint32_t s, d; |
michael@0 | 1429 | |
michael@0 | 1430 | uint32_t pack_cmp; |
michael@0 | 1431 | __m128i xmm_src, xmm_dst; |
michael@0 | 1432 | |
michael@0 | 1433 | while (w && (uintptr_t)pd & 15) |
michael@0 | 1434 | { |
michael@0 | 1435 | s = combine1 (ps, pm); |
michael@0 | 1436 | d = *pd; |
michael@0 | 1437 | |
michael@0 | 1438 | *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); |
michael@0 | 1439 | w--; |
michael@0 | 1440 | ps++; |
michael@0 | 1441 | if (pm) |
michael@0 | 1442 | pm++; |
michael@0 | 1443 | } |
michael@0 | 1444 | |
michael@0 | 1445 | while (w >= 4) |
michael@0 | 1446 | { |
michael@0 | 1447 | xmm_dst = load_128_aligned ((__m128i*)pd); |
michael@0 | 1448 | xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm); |
michael@0 | 1449 | |
michael@0 | 1450 | pack_cmp = _mm_movemask_epi8 ( |
michael@0 | 1451 | _mm_cmpgt_epi32 ( |
michael@0 | 1452 | _mm_srli_epi32 (xmm_src, 24), |
michael@0 | 1453 | _mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24))); |
michael@0 | 1454 | |
michael@0 | 1455 | /* if any source alpha is greater than the corresponding ~dest alpha */
michael@0 | 1456 | if (pack_cmp) |
michael@0 | 1457 | { |
michael@0 | 1458 | s = combine1 (ps++, pm); |
michael@0 | 1459 | d = *pd; |
michael@0 | 1460 | *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); |
michael@0 | 1461 | if (pm) |
michael@0 | 1462 | pm++; |
michael@0 | 1463 | |
michael@0 | 1464 | s = combine1 (ps++, pm); |
michael@0 | 1465 | d = *pd; |
michael@0 | 1466 | *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); |
michael@0 | 1467 | if (pm) |
michael@0 | 1468 | pm++; |
michael@0 | 1469 | |
michael@0 | 1470 | s = combine1 (ps++, pm); |
michael@0 | 1471 | d = *pd; |
michael@0 | 1472 | *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); |
michael@0 | 1473 | if (pm) |
michael@0 | 1474 | pm++; |
michael@0 | 1475 | |
michael@0 | 1476 | s = combine1 (ps++, pm); |
michael@0 | 1477 | d = *pd; |
michael@0 | 1478 | *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); |
michael@0 | 1479 | if (pm) |
michael@0 | 1480 | pm++; |
michael@0 | 1481 | } |
michael@0 | 1482 | else |
michael@0 | 1483 | { |
michael@0 | 1484 | save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src)); |
michael@0 | 1485 | |
michael@0 | 1486 | pd += 4; |
michael@0 | 1487 | ps += 4; |
michael@0 | 1488 | if (pm) |
michael@0 | 1489 | pm += 4; |
michael@0 | 1490 | } |
michael@0 | 1491 | |
michael@0 | 1492 | w -= 4; |
michael@0 | 1493 | } |
michael@0 | 1494 | |
michael@0 | 1495 | while (w--) |
michael@0 | 1496 | { |
michael@0 | 1497 | s = combine1 (ps, pm); |
michael@0 | 1498 | d = *pd; |
michael@0 | 1499 | |
michael@0 | 1500 | *pd++ = core_combine_saturate_u_pixel_sse2 (s, d); |
michael@0 | 1501 | ps++; |
michael@0 | 1502 | if (pm) |
michael@0 | 1503 | pm++; |
michael@0 | 1504 | } |
michael@0 | 1505 | } |
michael@0 | 1506 | |
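/* Component-alpha SRC: dest = src * mask, multiplied per channel. */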
michael@0 | 1507 | static void |
michael@0 | 1508 | sse2_combine_src_ca (pixman_implementation_t *imp, |
michael@0 | 1509 | pixman_op_t op, |
michael@0 | 1510 | uint32_t * pd, |
michael@0 | 1511 | const uint32_t * ps, |
michael@0 | 1512 | const uint32_t * pm, |
michael@0 | 1513 | int w) |
michael@0 | 1514 | { |
michael@0 | 1515 | uint32_t s, m; |
michael@0 | 1516 | |
michael@0 | 1517 | __m128i xmm_src_lo, xmm_src_hi; |
michael@0 | 1518 | __m128i xmm_mask_lo, xmm_mask_hi; |
michael@0 | 1519 | __m128i xmm_dst_lo, xmm_dst_hi; |
michael@0 | 1520 | |
michael@0 | 1521 | while (w && (uintptr_t)pd & 15) |
michael@0 | 1522 | { |
michael@0 | 1523 | s = *ps++; |
michael@0 | 1524 | m = *pm++; |
michael@0 | 1525 | *pd++ = pack_1x128_32 ( |
michael@0 | 1526 | pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); |
michael@0 | 1527 | w--; |
michael@0 | 1528 | } |
michael@0 | 1529 | |
michael@0 | 1530 | while (w >= 4) |
michael@0 | 1531 | { |
michael@0 | 1532 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
michael@0 | 1533 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
michael@0 | 1534 | |
michael@0 | 1535 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 1536 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 1537 | |
michael@0 | 1538 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
michael@0 | 1539 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 1540 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 1541 | |
michael@0 | 1542 | save_128_aligned ( |
michael@0 | 1543 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 1544 | |
michael@0 | 1545 | ps += 4; |
michael@0 | 1546 | pd += 4; |
michael@0 | 1547 | pm += 4; |
michael@0 | 1548 | w -= 4; |
michael@0 | 1549 | } |
michael@0 | 1550 | |
michael@0 | 1551 | while (w) |
michael@0 | 1552 | { |
michael@0 | 1553 | s = *ps++; |
michael@0 | 1554 | m = *pm++; |
michael@0 | 1555 | *pd++ = pack_1x128_32 ( |
michael@0 | 1556 | pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m))); |
michael@0 | 1557 | w--; |
michael@0 | 1558 | } |
michael@0 | 1559 | } |
michael@0 | 1560 | |
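/* Component-alpha OVER:
 * dest = src * mask + dest * (1 - src.a * mask), per channel.
 */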
michael@0 | 1561 | static force_inline uint32_t |
michael@0 | 1562 | core_combine_over_ca_pixel_sse2 (uint32_t src, |
michael@0 | 1563 | uint32_t mask, |
michael@0 | 1564 | uint32_t dst) |
michael@0 | 1565 | { |
michael@0 | 1566 | __m128i s = unpack_32_1x128 (src); |
michael@0 | 1567 | __m128i expAlpha = expand_alpha_1x128 (s); |
michael@0 | 1568 | __m128i unpk_mask = unpack_32_1x128 (mask); |
michael@0 | 1569 | __m128i unpk_dst = unpack_32_1x128 (dst); |
michael@0 | 1570 | |
michael@0 | 1571 | return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst)); |
michael@0 | 1572 | } |
michael@0 | 1573 | |
michael@0 | 1574 | static void |
michael@0 | 1575 | sse2_combine_over_ca (pixman_implementation_t *imp, |
michael@0 | 1576 | pixman_op_t op, |
michael@0 | 1577 | uint32_t * pd, |
michael@0 | 1578 | const uint32_t * ps, |
michael@0 | 1579 | const uint32_t * pm, |
michael@0 | 1580 | int w) |
michael@0 | 1581 | { |
michael@0 | 1582 | uint32_t s, m, d; |
michael@0 | 1583 | |
michael@0 | 1584 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
michael@0 | 1585 | __m128i xmm_src_lo, xmm_src_hi; |
michael@0 | 1586 | __m128i xmm_dst_lo, xmm_dst_hi; |
michael@0 | 1587 | __m128i xmm_mask_lo, xmm_mask_hi; |
michael@0 | 1588 | |
michael@0 | 1589 | while (w && (uintptr_t)pd & 15) |
michael@0 | 1590 | { |
michael@0 | 1591 | s = *ps++; |
michael@0 | 1592 | m = *pm++; |
michael@0 | 1593 | d = *pd; |
michael@0 | 1594 | |
michael@0 | 1595 | *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); |
michael@0 | 1596 | w--; |
michael@0 | 1597 | } |
michael@0 | 1598 | |
michael@0 | 1599 | while (w >= 4) |
michael@0 | 1600 | { |
michael@0 | 1601 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
michael@0 | 1602 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
michael@0 | 1603 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
michael@0 | 1604 | |
michael@0 | 1605 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 1606 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 1607 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 1608 | |
michael@0 | 1609 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
michael@0 | 1610 | &xmm_alpha_lo, &xmm_alpha_hi); |
michael@0 | 1611 | |
michael@0 | 1612 | in_over_2x128 (&xmm_src_lo, &xmm_src_hi, |
michael@0 | 1613 | &xmm_alpha_lo, &xmm_alpha_hi, |
michael@0 | 1614 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 1615 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 1616 | |
michael@0 | 1617 | save_128_aligned ( |
michael@0 | 1618 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 1619 | |
michael@0 | 1620 | ps += 4; |
michael@0 | 1621 | pd += 4; |
michael@0 | 1622 | pm += 4; |
michael@0 | 1623 | w -= 4; |
michael@0 | 1624 | } |
michael@0 | 1625 | |
michael@0 | 1626 | while (w) |
michael@0 | 1627 | { |
michael@0 | 1628 | s = *ps++; |
michael@0 | 1629 | m = *pm++; |
michael@0 | 1630 | d = *pd; |
michael@0 | 1631 | |
michael@0 | 1632 | *pd++ = core_combine_over_ca_pixel_sse2 (s, m, d); |
michael@0 | 1633 | w--; |
michael@0 | 1634 | } |
michael@0 | 1635 | } |
michael@0 | 1636 | |
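/* Component-alpha OVER_REVERSE:
 * dest = dest + (src * mask) * (1 - dest.a), per channel.
 */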
michael@0 | 1637 | static force_inline uint32_t |
michael@0 | 1638 | core_combine_over_reverse_ca_pixel_sse2 (uint32_t src, |
michael@0 | 1639 | uint32_t mask, |
michael@0 | 1640 | uint32_t dst) |
michael@0 | 1641 | { |
michael@0 | 1642 | __m128i d = unpack_32_1x128 (dst); |
michael@0 | 1643 | |
michael@0 | 1644 | return pack_1x128_32 ( |
michael@0 | 1645 | over_1x128 (d, expand_alpha_1x128 (d), |
michael@0 | 1646 | pix_multiply_1x128 (unpack_32_1x128 (src), |
michael@0 | 1647 | unpack_32_1x128 (mask)))); |
michael@0 | 1648 | } |
michael@0 | 1649 | |
michael@0 | 1650 | static void |
michael@0 | 1651 | sse2_combine_over_reverse_ca (pixman_implementation_t *imp, |
michael@0 | 1652 | pixman_op_t op, |
michael@0 | 1653 | uint32_t * pd, |
michael@0 | 1654 | const uint32_t * ps, |
michael@0 | 1655 | const uint32_t * pm, |
michael@0 | 1656 | int w) |
michael@0 | 1657 | { |
michael@0 | 1658 | uint32_t s, m, d; |
michael@0 | 1659 | |
michael@0 | 1660 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
michael@0 | 1661 | __m128i xmm_src_lo, xmm_src_hi; |
michael@0 | 1662 | __m128i xmm_dst_lo, xmm_dst_hi; |
michael@0 | 1663 | __m128i xmm_mask_lo, xmm_mask_hi; |
michael@0 | 1664 | |
michael@0 | 1665 | while (w && (uintptr_t)pd & 15) |
michael@0 | 1666 | { |
michael@0 | 1667 | s = *ps++; |
michael@0 | 1668 | m = *pm++; |
michael@0 | 1669 | d = *pd; |
michael@0 | 1670 | |
michael@0 | 1671 | *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); |
michael@0 | 1672 | w--; |
michael@0 | 1673 | } |
michael@0 | 1674 | |
michael@0 | 1675 | while (w >= 4) |
michael@0 | 1676 | { |
michael@0 | 1677 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
michael@0 | 1678 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
michael@0 | 1679 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
michael@0 | 1680 | |
michael@0 | 1681 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 1682 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 1683 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 1684 | |
michael@0 | 1685 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
michael@0 | 1686 | &xmm_alpha_lo, &xmm_alpha_hi); |
michael@0 | 1687 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
michael@0 | 1688 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 1689 | &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 1690 | |
michael@0 | 1691 | over_2x128 (&xmm_dst_lo, &xmm_dst_hi, |
michael@0 | 1692 | &xmm_alpha_lo, &xmm_alpha_hi, |
michael@0 | 1693 | &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 1694 | |
michael@0 | 1695 | save_128_aligned ( |
michael@0 | 1696 | (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); |
michael@0 | 1697 | |
michael@0 | 1698 | ps += 4; |
michael@0 | 1699 | pd += 4; |
michael@0 | 1700 | pm += 4; |
michael@0 | 1701 | w -= 4; |
michael@0 | 1702 | } |
michael@0 | 1703 | |
michael@0 | 1704 | while (w) |
michael@0 | 1705 | { |
michael@0 | 1706 | s = *ps++; |
michael@0 | 1707 | m = *pm++; |
michael@0 | 1708 | d = *pd; |
michael@0 | 1709 | |
michael@0 | 1710 | *pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d); |
michael@0 | 1711 | w--; |
michael@0 | 1712 | } |
michael@0 | 1713 | } |
michael@0 | 1714 | |
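/* Component-alpha IN: dest = (src * mask) * dest.a, per channel. */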
michael@0 | 1715 | static void |
michael@0 | 1716 | sse2_combine_in_ca (pixman_implementation_t *imp, |
michael@0 | 1717 | pixman_op_t op, |
michael@0 | 1718 | uint32_t * pd, |
michael@0 | 1719 | const uint32_t * ps, |
michael@0 | 1720 | const uint32_t * pm, |
michael@0 | 1721 | int w) |
michael@0 | 1722 | { |
michael@0 | 1723 | uint32_t s, m, d; |
michael@0 | 1724 | |
michael@0 | 1725 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
michael@0 | 1726 | __m128i xmm_src_lo, xmm_src_hi; |
michael@0 | 1727 | __m128i xmm_dst_lo, xmm_dst_hi; |
michael@0 | 1728 | __m128i xmm_mask_lo, xmm_mask_hi; |
michael@0 | 1729 | |
michael@0 | 1730 | while (w && (uintptr_t)pd & 15) |
michael@0 | 1731 | { |
michael@0 | 1732 | s = *ps++; |
michael@0 | 1733 | m = *pm++; |
michael@0 | 1734 | d = *pd; |
michael@0 | 1735 | |
michael@0 | 1736 | *pd++ = pack_1x128_32 ( |
michael@0 | 1737 | pix_multiply_1x128 ( |
michael@0 | 1738 | pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)), |
michael@0 | 1739 | expand_alpha_1x128 (unpack_32_1x128 (d)))); |
michael@0 | 1740 | |
michael@0 | 1741 | w--; |
michael@0 | 1742 | } |
michael@0 | 1743 | |
michael@0 | 1744 | while (w >= 4) |
michael@0 | 1745 | { |
michael@0 | 1746 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
michael@0 | 1747 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
michael@0 | 1748 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
michael@0 | 1749 | |
michael@0 | 1750 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 1751 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 1752 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 1753 | |
michael@0 | 1754 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
michael@0 | 1755 | &xmm_alpha_lo, &xmm_alpha_hi); |
michael@0 | 1756 | |
michael@0 | 1757 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
michael@0 | 1758 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 1759 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 1760 | |
michael@0 | 1761 | pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, |
michael@0 | 1762 | &xmm_alpha_lo, &xmm_alpha_hi, |
michael@0 | 1763 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 1764 | |
michael@0 | 1765 | save_128_aligned ( |
michael@0 | 1766 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 1767 | |
michael@0 | 1768 | ps += 4; |
michael@0 | 1769 | pd += 4; |
michael@0 | 1770 | pm += 4; |
michael@0 | 1771 | w -= 4; |
michael@0 | 1772 | } |
michael@0 | 1773 | |
michael@0 | 1774 | while (w) |
michael@0 | 1775 | { |
michael@0 | 1776 | s = *ps++; |
michael@0 | 1777 | m = *pm++; |
michael@0 | 1778 | d = *pd; |
michael@0 | 1779 | |
michael@0 | 1780 | *pd++ = pack_1x128_32 ( |
michael@0 | 1781 | pix_multiply_1x128 ( |
michael@0 | 1782 | pix_multiply_1x128 ( |
michael@0 | 1783 | unpack_32_1x128 (s), unpack_32_1x128 (m)), |
michael@0 | 1784 | expand_alpha_1x128 (unpack_32_1x128 (d)))); |
michael@0 | 1785 | |
michael@0 | 1786 | w--; |
michael@0 | 1787 | } |
michael@0 | 1788 | } |
michael@0 | 1789 | |
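/* Component-alpha IN_REVERSE: dest = dest * (mask * src.a), per channel. */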
michael@0 | 1790 | static void |
michael@0 | 1791 | sse2_combine_in_reverse_ca (pixman_implementation_t *imp, |
michael@0 | 1792 | pixman_op_t op, |
michael@0 | 1793 | uint32_t * pd, |
michael@0 | 1794 | const uint32_t * ps, |
michael@0 | 1795 | const uint32_t * pm, |
michael@0 | 1796 | int w) |
michael@0 | 1797 | { |
michael@0 | 1798 | uint32_t s, m, d; |
michael@0 | 1799 | |
michael@0 | 1800 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
michael@0 | 1801 | __m128i xmm_src_lo, xmm_src_hi; |
michael@0 | 1802 | __m128i xmm_dst_lo, xmm_dst_hi; |
michael@0 | 1803 | __m128i xmm_mask_lo, xmm_mask_hi; |
michael@0 | 1804 | |
michael@0 | 1805 | while (w && (uintptr_t)pd & 15) |
michael@0 | 1806 | { |
michael@0 | 1807 | s = *ps++; |
michael@0 | 1808 | m = *pm++; |
michael@0 | 1809 | d = *pd; |
michael@0 | 1810 | |
michael@0 | 1811 | *pd++ = pack_1x128_32 ( |
michael@0 | 1812 | pix_multiply_1x128 ( |
michael@0 | 1813 | unpack_32_1x128 (d), |
michael@0 | 1814 | pix_multiply_1x128 (unpack_32_1x128 (m), |
michael@0 | 1815 | expand_alpha_1x128 (unpack_32_1x128 (s))))); |
michael@0 | 1816 | w--; |
michael@0 | 1817 | } |
michael@0 | 1818 | |
michael@0 | 1819 | while (w >= 4) |
michael@0 | 1820 | { |
michael@0 | 1821 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
michael@0 | 1822 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
michael@0 | 1823 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
michael@0 | 1824 | |
michael@0 | 1825 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 1826 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 1827 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 1828 | |
michael@0 | 1829 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
michael@0 | 1830 | &xmm_alpha_lo, &xmm_alpha_hi); |
michael@0 | 1831 | pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 1832 | &xmm_alpha_lo, &xmm_alpha_hi, |
michael@0 | 1833 | &xmm_alpha_lo, &xmm_alpha_hi); |
michael@0 | 1834 | |
michael@0 | 1835 | pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, |
michael@0 | 1836 | &xmm_alpha_lo, &xmm_alpha_hi, |
michael@0 | 1837 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 1838 | |
michael@0 | 1839 | save_128_aligned ( |
michael@0 | 1840 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 1841 | |
michael@0 | 1842 | ps += 4; |
michael@0 | 1843 | pd += 4; |
michael@0 | 1844 | pm += 4; |
michael@0 | 1845 | w -= 4; |
michael@0 | 1846 | } |
michael@0 | 1847 | |
michael@0 | 1848 | while (w) |
michael@0 | 1849 | { |
michael@0 | 1850 | s = *ps++; |
michael@0 | 1851 | m = *pm++; |
michael@0 | 1852 | d = *pd; |
michael@0 | 1853 | |
michael@0 | 1854 | *pd++ = pack_1x128_32 ( |
michael@0 | 1855 | pix_multiply_1x128 ( |
michael@0 | 1856 | unpack_32_1x128 (d), |
michael@0 | 1857 | pix_multiply_1x128 (unpack_32_1x128 (m), |
michael@0 | 1858 | expand_alpha_1x128 (unpack_32_1x128 (s))))); |
michael@0 | 1859 | w--; |
michael@0 | 1860 | } |
michael@0 | 1861 | } |
michael@0 | 1862 | |
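/* Component-alpha OUT: dest = (src * mask) * (1 - dest.a), per channel. */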
michael@0 | 1863 | static void |
michael@0 | 1864 | sse2_combine_out_ca (pixman_implementation_t *imp, |
michael@0 | 1865 | pixman_op_t op, |
michael@0 | 1866 | uint32_t * pd, |
michael@0 | 1867 | const uint32_t * ps, |
michael@0 | 1868 | const uint32_t * pm, |
michael@0 | 1869 | int w) |
michael@0 | 1870 | { |
michael@0 | 1871 | uint32_t s, m, d; |
michael@0 | 1872 | |
michael@0 | 1873 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
michael@0 | 1874 | __m128i xmm_src_lo, xmm_src_hi; |
michael@0 | 1875 | __m128i xmm_dst_lo, xmm_dst_hi; |
michael@0 | 1876 | __m128i xmm_mask_lo, xmm_mask_hi; |
michael@0 | 1877 | |
michael@0 | 1878 | while (w && (uintptr_t)pd & 15) |
michael@0 | 1879 | { |
michael@0 | 1880 | s = *ps++; |
michael@0 | 1881 | m = *pm++; |
michael@0 | 1882 | d = *pd; |
michael@0 | 1883 | |
michael@0 | 1884 | *pd++ = pack_1x128_32 ( |
michael@0 | 1885 | pix_multiply_1x128 ( |
michael@0 | 1886 | pix_multiply_1x128 ( |
michael@0 | 1887 | unpack_32_1x128 (s), unpack_32_1x128 (m)), |
michael@0 | 1888 | negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); |
michael@0 | 1889 | w--; |
michael@0 | 1890 | } |
michael@0 | 1891 | |
michael@0 | 1892 | while (w >= 4) |
michael@0 | 1893 | { |
michael@0 | 1894 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
michael@0 | 1895 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
michael@0 | 1896 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
michael@0 | 1897 | |
michael@0 | 1898 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 1899 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 1900 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 1901 | |
michael@0 | 1902 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
michael@0 | 1903 | &xmm_alpha_lo, &xmm_alpha_hi); |
michael@0 | 1904 | negate_2x128 (xmm_alpha_lo, xmm_alpha_hi, |
michael@0 | 1905 | &xmm_alpha_lo, &xmm_alpha_hi); |
michael@0 | 1906 | |
michael@0 | 1907 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
michael@0 | 1908 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 1909 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 1910 | pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, |
michael@0 | 1911 | &xmm_alpha_lo, &xmm_alpha_hi, |
michael@0 | 1912 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 1913 | |
michael@0 | 1914 | save_128_aligned ( |
michael@0 | 1915 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 1916 | |
michael@0 | 1917 | ps += 4; |
michael@0 | 1918 | pd += 4; |
michael@0 | 1919 | pm += 4; |
michael@0 | 1920 | w -= 4; |
michael@0 | 1921 | } |
michael@0 | 1922 | |
michael@0 | 1923 | while (w) |
michael@0 | 1924 | { |
michael@0 | 1925 | s = *ps++; |
michael@0 | 1926 | m = *pm++; |
michael@0 | 1927 | d = *pd; |
michael@0 | 1928 | |
michael@0 | 1929 | *pd++ = pack_1x128_32 ( |
michael@0 | 1930 | pix_multiply_1x128 ( |
michael@0 | 1931 | pix_multiply_1x128 ( |
michael@0 | 1932 | unpack_32_1x128 (s), unpack_32_1x128 (m)), |
michael@0 | 1933 | negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d))))); |
michael@0 | 1934 | |
michael@0 | 1935 | w--; |
michael@0 | 1936 | } |
michael@0 | 1937 | } |
michael@0 | 1938 | |
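/* Component-alpha OUT_REVERSE:
 * dest = dest * (1 - mask * src.a), per channel.
 */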
michael@0 | 1939 | static void |
michael@0 | 1940 | sse2_combine_out_reverse_ca (pixman_implementation_t *imp, |
michael@0 | 1941 | pixman_op_t op, |
michael@0 | 1942 | uint32_t * pd, |
michael@0 | 1943 | const uint32_t * ps, |
michael@0 | 1944 | const uint32_t * pm, |
michael@0 | 1945 | int w) |
michael@0 | 1946 | { |
michael@0 | 1947 | uint32_t s, m, d; |
michael@0 | 1948 | |
michael@0 | 1949 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
michael@0 | 1950 | __m128i xmm_src_lo, xmm_src_hi; |
michael@0 | 1951 | __m128i xmm_dst_lo, xmm_dst_hi; |
michael@0 | 1952 | __m128i xmm_mask_lo, xmm_mask_hi; |
michael@0 | 1953 | |
michael@0 | 1954 | while (w && (uintptr_t)pd & 15) |
michael@0 | 1955 | { |
michael@0 | 1956 | s = *ps++; |
michael@0 | 1957 | m = *pm++; |
michael@0 | 1958 | d = *pd; |
michael@0 | 1959 | |
michael@0 | 1960 | *pd++ = pack_1x128_32 ( |
michael@0 | 1961 | pix_multiply_1x128 ( |
michael@0 | 1962 | unpack_32_1x128 (d), |
michael@0 | 1963 | negate_1x128 (pix_multiply_1x128 ( |
michael@0 | 1964 | unpack_32_1x128 (m), |
michael@0 | 1965 | expand_alpha_1x128 (unpack_32_1x128 (s)))))); |
michael@0 | 1966 | w--; |
michael@0 | 1967 | } |
michael@0 | 1968 | |
michael@0 | 1969 | while (w >= 4) |
michael@0 | 1970 | { |
michael@0 | 1971 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
michael@0 | 1972 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
michael@0 | 1973 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
michael@0 | 1974 | |
michael@0 | 1975 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 1976 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 1977 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 1978 | |
michael@0 | 1979 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
michael@0 | 1980 | &xmm_alpha_lo, &xmm_alpha_hi); |
michael@0 | 1981 | |
michael@0 | 1982 | pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 1983 | &xmm_alpha_lo, &xmm_alpha_hi, |
michael@0 | 1984 | &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 1985 | |
michael@0 | 1986 | negate_2x128 (xmm_mask_lo, xmm_mask_hi, |
michael@0 | 1987 | &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 1988 | |
michael@0 | 1989 | pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi, |
michael@0 | 1990 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 1991 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 1992 | |
michael@0 | 1993 | save_128_aligned ( |
michael@0 | 1994 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 1995 | |
michael@0 | 1996 | ps += 4; |
michael@0 | 1997 | pd += 4; |
michael@0 | 1998 | pm += 4; |
michael@0 | 1999 | w -= 4; |
michael@0 | 2000 | } |
michael@0 | 2001 | |
michael@0 | 2002 | while (w) |
michael@0 | 2003 | { |
michael@0 | 2004 | s = *ps++; |
michael@0 | 2005 | m = *pm++; |
michael@0 | 2006 | d = *pd; |
michael@0 | 2007 | |
michael@0 | 2008 | *pd++ = pack_1x128_32 ( |
michael@0 | 2009 | pix_multiply_1x128 ( |
michael@0 | 2010 | unpack_32_1x128 (d), |
michael@0 | 2011 | negate_1x128 (pix_multiply_1x128 ( |
michael@0 | 2012 | unpack_32_1x128 (m), |
michael@0 | 2013 | expand_alpha_1x128 (unpack_32_1x128 (s)))))); |
michael@0 | 2014 | w--; |
michael@0 | 2015 | } |
michael@0 | 2016 | } |
michael@0 | 2017 | |
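/* Component-alpha ATOP:
 * dest = (src * mask) * dest.a + dest * (1 - mask * src.a),
 * computed with a single fused add-multiply.
 */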
michael@0 | 2018 | static force_inline uint32_t |
michael@0 | 2019 | core_combine_atop_ca_pixel_sse2 (uint32_t src, |
michael@0 | 2020 | uint32_t mask, |
michael@0 | 2021 | uint32_t dst) |
michael@0 | 2022 | { |
michael@0 | 2023 | __m128i m = unpack_32_1x128 (mask); |
michael@0 | 2024 | __m128i s = unpack_32_1x128 (src); |
michael@0 | 2025 | __m128i d = unpack_32_1x128 (dst); |
michael@0 | 2026 | __m128i sa = expand_alpha_1x128 (s); |
michael@0 | 2027 | __m128i da = expand_alpha_1x128 (d); |
michael@0 | 2028 | |
michael@0 | 2029 | s = pix_multiply_1x128 (s, m); |
michael@0 | 2030 | m = negate_1x128 (pix_multiply_1x128 (m, sa)); |
michael@0 | 2031 | |
michael@0 | 2032 | return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); |
michael@0 | 2033 | } |
michael@0 | 2034 | |
michael@0 | 2035 | static void |
michael@0 | 2036 | sse2_combine_atop_ca (pixman_implementation_t *imp, |
michael@0 | 2037 | pixman_op_t op, |
michael@0 | 2038 | uint32_t * pd, |
michael@0 | 2039 | const uint32_t * ps, |
michael@0 | 2040 | const uint32_t * pm, |
michael@0 | 2041 | int w) |
michael@0 | 2042 | { |
michael@0 | 2043 | uint32_t s, m, d; |
michael@0 | 2044 | |
michael@0 | 2045 | __m128i xmm_src_lo, xmm_src_hi; |
michael@0 | 2046 | __m128i xmm_dst_lo, xmm_dst_hi; |
michael@0 | 2047 | __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; |
michael@0 | 2048 | __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; |
michael@0 | 2049 | __m128i xmm_mask_lo, xmm_mask_hi; |
michael@0 | 2050 | |
michael@0 | 2051 | while (w && (uintptr_t)pd & 15) |
michael@0 | 2052 | { |
michael@0 | 2053 | s = *ps++; |
michael@0 | 2054 | m = *pm++; |
michael@0 | 2055 | d = *pd; |
michael@0 | 2056 | |
michael@0 | 2057 | *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); |
michael@0 | 2058 | w--; |
michael@0 | 2059 | } |
michael@0 | 2060 | |
michael@0 | 2061 | while (w >= 4) |
michael@0 | 2062 | { |
michael@0 | 2063 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
michael@0 | 2064 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
michael@0 | 2065 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
michael@0 | 2066 | |
michael@0 | 2067 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 2068 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 2069 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 2070 | |
michael@0 | 2071 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
michael@0 | 2072 | &xmm_alpha_src_lo, &xmm_alpha_src_hi); |
michael@0 | 2073 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
michael@0 | 2074 | &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
michael@0 | 2075 | |
michael@0 | 2076 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
michael@0 | 2077 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 2078 | &xmm_src_lo, &xmm_src_hi); |
michael@0 | 2079 | pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 2080 | &xmm_alpha_src_lo, &xmm_alpha_src_hi, |
michael@0 | 2081 | &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 2082 | |
michael@0 | 2083 | negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 2084 | |
michael@0 | 2085 | pix_add_multiply_2x128 ( |
michael@0 | 2086 | &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 2087 | &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, |
michael@0 | 2088 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 2089 | |
michael@0 | 2090 | save_128_aligned ( |
michael@0 | 2091 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 2092 | |
michael@0 | 2093 | ps += 4; |
michael@0 | 2094 | pd += 4; |
michael@0 | 2095 | pm += 4; |
michael@0 | 2096 | w -= 4; |
michael@0 | 2097 | } |
michael@0 | 2098 | |
michael@0 | 2099 | while (w) |
michael@0 | 2100 | { |
michael@0 | 2101 | s = *ps++; |
michael@0 | 2102 | m = *pm++; |
michael@0 | 2103 | d = *pd; |
michael@0 | 2104 | |
michael@0 | 2105 | *pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d); |
michael@0 | 2106 | w--; |
michael@0 | 2107 | } |
michael@0 | 2108 | } |
michael@0 | 2109 | |
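/* Component-alpha ATOP_REVERSE:
 * dest = (src * mask) * (1 - dest.a) + dest * (mask * src.a).
 */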
michael@0 | 2110 | static force_inline uint32_t |
michael@0 | 2111 | core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src, |
michael@0 | 2112 | uint32_t mask, |
michael@0 | 2113 | uint32_t dst) |
michael@0 | 2114 | { |
michael@0 | 2115 | __m128i m = unpack_32_1x128 (mask); |
michael@0 | 2116 | __m128i s = unpack_32_1x128 (src); |
michael@0 | 2117 | __m128i d = unpack_32_1x128 (dst); |
michael@0 | 2118 | |
michael@0 | 2119 | __m128i da = negate_1x128 (expand_alpha_1x128 (d)); |
michael@0 | 2120 | __m128i sa = expand_alpha_1x128 (s); |
michael@0 | 2121 | |
michael@0 | 2122 | s = pix_multiply_1x128 (s, m); |
michael@0 | 2123 | m = pix_multiply_1x128 (m, sa); |
michael@0 | 2124 | |
michael@0 | 2125 | return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da)); |
michael@0 | 2126 | } |
michael@0 | 2127 | |
michael@0 | 2128 | static void |
michael@0 | 2129 | sse2_combine_atop_reverse_ca (pixman_implementation_t *imp, |
michael@0 | 2130 | pixman_op_t op, |
michael@0 | 2131 | uint32_t * pd, |
michael@0 | 2132 | const uint32_t * ps, |
michael@0 | 2133 | const uint32_t * pm, |
michael@0 | 2134 | int w) |
michael@0 | 2135 | { |
michael@0 | 2136 | uint32_t s, m, d; |
michael@0 | 2137 | |
michael@0 | 2138 | __m128i xmm_src_lo, xmm_src_hi; |
michael@0 | 2139 | __m128i xmm_dst_lo, xmm_dst_hi; |
michael@0 | 2140 | __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; |
michael@0 | 2141 | __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; |
michael@0 | 2142 | __m128i xmm_mask_lo, xmm_mask_hi; |
michael@0 | 2143 | |
michael@0 | 2144 | while (w && (uintptr_t)pd & 15) |
michael@0 | 2145 | { |
michael@0 | 2146 | s = *ps++; |
michael@0 | 2147 | m = *pm++; |
michael@0 | 2148 | d = *pd; |
michael@0 | 2149 | |
michael@0 | 2150 | *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); |
michael@0 | 2151 | w--; |
michael@0 | 2152 | } |
michael@0 | 2153 | |
michael@0 | 2154 | while (w >= 4) |
michael@0 | 2155 | { |
michael@0 | 2156 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
michael@0 | 2157 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
michael@0 | 2158 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
michael@0 | 2159 | |
michael@0 | 2160 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 2161 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 2162 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 2163 | |
michael@0 | 2164 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
michael@0 | 2165 | &xmm_alpha_src_lo, &xmm_alpha_src_hi); |
michael@0 | 2166 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
michael@0 | 2167 | &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
michael@0 | 2168 | |
michael@0 | 2169 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
michael@0 | 2170 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 2171 | &xmm_src_lo, &xmm_src_hi); |
michael@0 | 2172 | pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 2173 | &xmm_alpha_src_lo, &xmm_alpha_src_hi, |
michael@0 | 2174 | &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 2175 | |
michael@0 | 2176 | negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, |
michael@0 | 2177 | &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
michael@0 | 2178 | |
michael@0 | 2179 | pix_add_multiply_2x128 ( |
michael@0 | 2180 | &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 2181 | &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, |
michael@0 | 2182 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 2183 | |
michael@0 | 2184 | save_128_aligned ( |
michael@0 | 2185 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 2186 | |
michael@0 | 2187 | ps += 4; |
michael@0 | 2188 | pd += 4; |
michael@0 | 2189 | pm += 4; |
michael@0 | 2190 | w -= 4; |
michael@0 | 2191 | } |
michael@0 | 2192 | |
michael@0 | 2193 | while (w) |
michael@0 | 2194 | { |
michael@0 | 2195 | s = *ps++; |
michael@0 | 2196 | m = *pm++; |
michael@0 | 2197 | d = *pd; |
michael@0 | 2198 | |
michael@0 | 2199 | *pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d); |
michael@0 | 2200 | w--; |
michael@0 | 2201 | } |
michael@0 | 2202 | } |
michael@0 | 2203 | |
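/* Component-alpha XOR:
 * dest = (src * mask) * (1 - dest.a) + dest * (1 - mask * src.a).
 */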
michael@0 | 2204 | static force_inline uint32_t |
michael@0 | 2205 | core_combine_xor_ca_pixel_sse2 (uint32_t src, |
michael@0 | 2206 | uint32_t mask, |
michael@0 | 2207 | uint32_t dst) |
michael@0 | 2208 | { |
michael@0 | 2209 | __m128i a = unpack_32_1x128 (mask); |
michael@0 | 2210 | __m128i s = unpack_32_1x128 (src); |
michael@0 | 2211 | __m128i d = unpack_32_1x128 (dst); |
michael@0 | 2212 | |
michael@0 | 2213 | __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 ( |
michael@0 | 2214 | a, expand_alpha_1x128 (s))); |
michael@0 | 2215 | __m128i dest = pix_multiply_1x128 (s, a); |
michael@0 | 2216 | __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d)); |
michael@0 | 2217 | |
michael@0 | 2218 | return pack_1x128_32 (pix_add_multiply_1x128 (&d, |
michael@0 | 2219 | &alpha_dst, |
michael@0 | 2220 | &dest, |
michael@0 | 2221 | &alpha_src)); |
michael@0 | 2222 | } |
michael@0 | 2223 | |
michael@0 | 2224 | static void |
michael@0 | 2225 | sse2_combine_xor_ca (pixman_implementation_t *imp, |
michael@0 | 2226 | pixman_op_t op, |
michael@0 | 2227 | uint32_t * pd, |
michael@0 | 2228 | const uint32_t * ps, |
michael@0 | 2229 | const uint32_t * pm, |
michael@0 | 2230 | int w) |
michael@0 | 2231 | { |
michael@0 | 2232 | uint32_t s, m, d; |
michael@0 | 2233 | |
michael@0 | 2234 | __m128i xmm_src_lo, xmm_src_hi; |
michael@0 | 2235 | __m128i xmm_dst_lo, xmm_dst_hi; |
michael@0 | 2236 | __m128i xmm_alpha_src_lo, xmm_alpha_src_hi; |
michael@0 | 2237 | __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi; |
michael@0 | 2238 | __m128i xmm_mask_lo, xmm_mask_hi; |
michael@0 | 2239 | |
michael@0 | 2240 | while (w && (uintptr_t)pd & 15) |
michael@0 | 2241 | { |
michael@0 | 2242 | s = *ps++; |
michael@0 | 2243 | m = *pm++; |
michael@0 | 2244 | d = *pd; |
michael@0 | 2245 | |
michael@0 | 2246 | *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); |
michael@0 | 2247 | w--; |
michael@0 | 2248 | } |
michael@0 | 2249 | |
michael@0 | 2250 | while (w >= 4) |
michael@0 | 2251 | { |
michael@0 | 2252 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
michael@0 | 2253 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
michael@0 | 2254 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
michael@0 | 2255 | |
michael@0 | 2256 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 2257 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 2258 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 2259 | |
michael@0 | 2260 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
michael@0 | 2261 | &xmm_alpha_src_lo, &xmm_alpha_src_hi); |
michael@0 | 2262 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, |
michael@0 | 2263 | &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
michael@0 | 2264 | |
michael@0 | 2265 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
michael@0 | 2266 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 2267 | &xmm_src_lo, &xmm_src_hi); |
michael@0 | 2268 | pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 2269 | &xmm_alpha_src_lo, &xmm_alpha_src_hi, |
michael@0 | 2270 | &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 2271 | |
michael@0 | 2272 | negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi, |
michael@0 | 2273 | &xmm_alpha_dst_lo, &xmm_alpha_dst_hi); |
michael@0 | 2274 | negate_2x128 (xmm_mask_lo, xmm_mask_hi, |
michael@0 | 2275 | &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 2276 | |
michael@0 | 2277 | pix_add_multiply_2x128 ( |
michael@0 | 2278 | &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 2279 | &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi, |
michael@0 | 2280 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 2281 | |
michael@0 | 2282 | save_128_aligned ( |
michael@0 | 2283 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 2284 | |
michael@0 | 2285 | ps += 4; |
michael@0 | 2286 | pd += 4; |
michael@0 | 2287 | pm += 4; |
michael@0 | 2288 | w -= 4; |
michael@0 | 2289 | } |
michael@0 | 2290 | |
michael@0 | 2291 | while (w) |
michael@0 | 2292 | { |
michael@0 | 2293 | s = *ps++; |
michael@0 | 2294 | m = *pm++; |
michael@0 | 2295 | d = *pd; |
michael@0 | 2296 | |
michael@0 | 2297 | *pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d); |
michael@0 | 2298 | w--; |
michael@0 | 2299 | } |
michael@0 | 2300 | } |
michael@0 | 2301 | |
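/* Component-alpha ADD: dest = clamp (src * mask + dest), per channel. */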
michael@0 | 2302 | static void |
michael@0 | 2303 | sse2_combine_add_ca (pixman_implementation_t *imp, |
michael@0 | 2304 | pixman_op_t op, |
michael@0 | 2305 | uint32_t * pd, |
michael@0 | 2306 | const uint32_t * ps, |
michael@0 | 2307 | const uint32_t * pm, |
michael@0 | 2308 | int w) |
michael@0 | 2309 | { |
michael@0 | 2310 | uint32_t s, m, d; |
michael@0 | 2311 | |
michael@0 | 2312 | __m128i xmm_src_lo, xmm_src_hi; |
michael@0 | 2313 | __m128i xmm_dst_lo, xmm_dst_hi; |
michael@0 | 2314 | __m128i xmm_mask_lo, xmm_mask_hi; |
michael@0 | 2315 | |
michael@0 | 2316 | while (w && (uintptr_t)pd & 15) |
michael@0 | 2317 | { |
michael@0 | 2318 | s = *ps++; |
michael@0 | 2319 | m = *pm++; |
michael@0 | 2320 | d = *pd; |
michael@0 | 2321 | |
michael@0 | 2322 | *pd++ = pack_1x128_32 ( |
michael@0 | 2323 | _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), |
michael@0 | 2324 | unpack_32_1x128 (m)), |
michael@0 | 2325 | unpack_32_1x128 (d))); |
michael@0 | 2326 | w--; |
michael@0 | 2327 | } |
michael@0 | 2328 | |
michael@0 | 2329 | while (w >= 4) |
michael@0 | 2330 | { |
michael@0 | 2331 | xmm_src_hi = load_128_unaligned ((__m128i*)ps); |
michael@0 | 2332 | xmm_mask_hi = load_128_unaligned ((__m128i*)pm); |
michael@0 | 2333 | xmm_dst_hi = load_128_aligned ((__m128i*)pd); |
michael@0 | 2334 | |
michael@0 | 2335 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 2336 | unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 2337 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 2338 | |
michael@0 | 2339 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
michael@0 | 2340 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 2341 | &xmm_src_lo, &xmm_src_hi); |
michael@0 | 2342 | |
michael@0 | 2343 | save_128_aligned ( |
michael@0 | 2344 | (__m128i*)pd, pack_2x128_128 ( |
michael@0 | 2345 | _mm_adds_epu8 (xmm_src_lo, xmm_dst_lo), |
michael@0 | 2346 | _mm_adds_epu8 (xmm_src_hi, xmm_dst_hi))); |
michael@0 | 2347 | |
michael@0 | 2348 | ps += 4; |
michael@0 | 2349 | pd += 4; |
michael@0 | 2350 | pm += 4; |
michael@0 | 2351 | w -= 4; |
michael@0 | 2352 | } |
michael@0 | 2353 | |
michael@0 | 2354 | while (w) |
michael@0 | 2355 | { |
michael@0 | 2356 | s = *ps++; |
michael@0 | 2357 | m = *pm++; |
michael@0 | 2358 | d = *pd; |
michael@0 | 2359 | |
michael@0 | 2360 | *pd++ = pack_1x128_32 ( |
michael@0 | 2361 | _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s), |
michael@0 | 2362 | unpack_32_1x128 (m)), |
michael@0 | 2363 | unpack_32_1x128 (d))); |
michael@0 | 2364 | w--; |
michael@0 | 2365 | } |
michael@0 | 2366 | } |
michael@0 | 2367 | |
michael@0 | 2368 | static force_inline __m128i |
michael@0 | 2369 | create_mask_16_128 (uint16_t mask) |
michael@0 | 2370 | { |
michael@0 | 2371 | return _mm_set1_epi16 (mask); |
michael@0 | 2372 | } |
michael@0 | 2373 | |
michael@0 | 2374 | /* Work around a code generation bug in Sun Studio 12. */ |
michael@0 | 2375 | #if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590) |
michael@0 | 2376 | # define create_mask_2x32_128(mask0, mask1) \ |
michael@0 | 2377 | (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1))) |
michael@0 | 2378 | #else |
michael@0 | 2379 | static force_inline __m128i |
michael@0 | 2380 | create_mask_2x32_128 (uint32_t mask0, |
michael@0 | 2381 | uint32_t mask1) |
michael@0 | 2382 | { |
michael@0 | 2383 | return _mm_set_epi32 (mask0, mask1, mask0, mask1); |
michael@0 | 2384 | } |
michael@0 | 2385 | #endif |
michael@0 | 2386 | |
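/* Composite a solid color OVER an a8r8g8b8 destination.  The color and
 * its expanded alpha are loop-invariant, so only the destination is
 * loaded inside the loops below.
 */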
michael@0 | 2387 | static void |
michael@0 | 2388 | sse2_composite_over_n_8888 (pixman_implementation_t *imp, |
michael@0 | 2389 | pixman_composite_info_t *info) |
michael@0 | 2390 | { |
michael@0 | 2391 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 2392 | uint32_t src; |
michael@0 | 2393 | uint32_t *dst_line, *dst, d; |
michael@0 | 2394 | int32_t w; |
michael@0 | 2395 | int dst_stride; |
michael@0 | 2396 | __m128i xmm_src, xmm_alpha; |
michael@0 | 2397 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
michael@0 | 2398 | |
michael@0 | 2399 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
michael@0 | 2400 | |
michael@0 | 2401 | if (src == 0) |
michael@0 | 2402 | return; |
michael@0 | 2403 | |
michael@0 | 2404 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 2405 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
michael@0 | 2406 | |
michael@0 | 2407 | xmm_src = expand_pixel_32_1x128 (src); |
michael@0 | 2408 | xmm_alpha = expand_alpha_1x128 (xmm_src); |
michael@0 | 2409 | |
michael@0 | 2410 | while (height--) |
michael@0 | 2411 | { |
michael@0 | 2412 | dst = dst_line; |
michael@0 | 2413 | |
michael@0 | 2414 | dst_line += dst_stride; |
michael@0 | 2415 | w = width; |
michael@0 | 2416 | |
michael@0 | 2417 | while (w && (uintptr_t)dst & 15) |
michael@0 | 2418 | { |
michael@0 | 2419 | d = *dst; |
michael@0 | 2420 | *dst++ = pack_1x128_32 (over_1x128 (xmm_src, |
michael@0 | 2421 | xmm_alpha, |
michael@0 | 2422 | unpack_32_1x128 (d))); |
michael@0 | 2423 | w--; |
michael@0 | 2424 | } |
michael@0 | 2425 | |
michael@0 | 2426 | while (w >= 4) |
michael@0 | 2427 | { |
michael@0 | 2428 | xmm_dst = load_128_aligned ((__m128i*)dst); |
michael@0 | 2429 | |
michael@0 | 2430 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 2431 | |
michael@0 | 2432 | over_2x128 (&xmm_src, &xmm_src, |
michael@0 | 2433 | &xmm_alpha, &xmm_alpha, |
michael@0 | 2434 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 2435 | |
michael@0 | 2436 | /* rebuild the 4-pixel data and save */
michael@0 | 2437 | save_128_aligned ( |
michael@0 | 2438 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 2439 | |
michael@0 | 2440 | w -= 4; |
michael@0 | 2441 | dst += 4; |
michael@0 | 2442 | } |
michael@0 | 2443 | |
michael@0 | 2444 | while (w) |
michael@0 | 2445 | { |
michael@0 | 2446 | d = *dst; |
michael@0 | 2447 | *dst++ = pack_1x128_32 (over_1x128 (xmm_src, |
michael@0 | 2448 | xmm_alpha, |
michael@0 | 2449 | unpack_32_1x128 (d))); |
michael@0 | 2450 | w--; |
michael@0 | 2451 | } |
michael@0 | 2452 | |
michael@0 | 2453 | } |
michael@0 | 2454 | } |
michael@0 | 2455 | |
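/* Composite a solid color OVER an r5g6b5 destination: each batch of
 * eight 565 pixels is expanded to 8888, blended, and packed back.
 */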
michael@0 | 2456 | static void |
michael@0 | 2457 | sse2_composite_over_n_0565 (pixman_implementation_t *imp, |
michael@0 | 2458 | pixman_composite_info_t *info) |
michael@0 | 2459 | { |
michael@0 | 2460 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 2461 | uint32_t src; |
michael@0 | 2462 | uint16_t *dst_line, *dst, d; |
michael@0 | 2463 | int32_t w; |
michael@0 | 2464 | int dst_stride; |
michael@0 | 2465 | __m128i xmm_src, xmm_alpha; |
michael@0 | 2466 | __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; |
michael@0 | 2467 | |
michael@0 | 2468 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
michael@0 | 2469 | |
michael@0 | 2470 | if (src == 0) |
michael@0 | 2471 | return; |
michael@0 | 2472 | |
michael@0 | 2473 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 2474 | dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
michael@0 | 2475 | |
michael@0 | 2476 | xmm_src = expand_pixel_32_1x128 (src); |
michael@0 | 2477 | xmm_alpha = expand_alpha_1x128 (xmm_src); |
michael@0 | 2478 | |
michael@0 | 2479 | while (height--) |
michael@0 | 2480 | { |
michael@0 | 2481 | dst = dst_line; |
michael@0 | 2482 | |
michael@0 | 2483 | dst_line += dst_stride; |
michael@0 | 2484 | w = width; |
michael@0 | 2485 | |
michael@0 | 2486 | while (w && (uintptr_t)dst & 15) |
michael@0 | 2487 | { |
michael@0 | 2488 | d = *dst; |
michael@0 | 2489 | |
michael@0 | 2490 | *dst++ = pack_565_32_16 ( |
michael@0 | 2491 | pack_1x128_32 (over_1x128 (xmm_src, |
michael@0 | 2492 | xmm_alpha, |
michael@0 | 2493 | expand565_16_1x128 (d)))); |
michael@0 | 2494 | w--; |
michael@0 | 2495 | } |
michael@0 | 2496 | |
michael@0 | 2497 | while (w >= 8) |
michael@0 | 2498 | { |
michael@0 | 2499 | xmm_dst = load_128_aligned ((__m128i*)dst); |
michael@0 | 2500 | |
michael@0 | 2501 | unpack_565_128_4x128 (xmm_dst, |
michael@0 | 2502 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); |
michael@0 | 2503 | |
michael@0 | 2504 | over_2x128 (&xmm_src, &xmm_src, |
michael@0 | 2505 | &xmm_alpha, &xmm_alpha, |
michael@0 | 2506 | &xmm_dst0, &xmm_dst1); |
michael@0 | 2507 | over_2x128 (&xmm_src, &xmm_src, |
michael@0 | 2508 | &xmm_alpha, &xmm_alpha, |
michael@0 | 2509 | &xmm_dst2, &xmm_dst3); |
michael@0 | 2510 | |
michael@0 | 2511 | xmm_dst = pack_565_4x128_128 ( |
michael@0 | 2512 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); |
michael@0 | 2513 | |
michael@0 | 2514 | save_128_aligned ((__m128i*)dst, xmm_dst); |
michael@0 | 2515 | |
michael@0 | 2516 | dst += 8; |
michael@0 | 2517 | w -= 8; |
michael@0 | 2518 | } |
michael@0 | 2519 | |
michael@0 | 2520 | while (w--) |
michael@0 | 2521 | { |
michael@0 | 2522 | d = *dst; |
michael@0 | 2523 | *dst++ = pack_565_32_16 ( |
michael@0 | 2524 | pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha, |
michael@0 | 2525 | expand565_16_1x128 (d)))); |
michael@0 | 2526 | } |
michael@0 | 2527 | } |
michael@0 | 2528 | |
michael@0 | 2529 | } |
michael@0 | 2530 | |
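/* ADD of a solid color through an a8r8g8b8 component-alpha mask:
 * dest = clamp (color * mask + dest).  Four-pixel groups whose mask is
 * entirely zero are detected with a movemask test and skipped.
 */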
michael@0 | 2531 | static void |
michael@0 | 2532 | sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp, |
michael@0 | 2533 | pixman_composite_info_t *info) |
michael@0 | 2534 | { |
michael@0 | 2535 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 2536 | uint32_t src; |
michael@0 | 2537 | uint32_t *dst_line, d; |
michael@0 | 2538 | uint32_t *mask_line, m; |
michael@0 | 2539 | uint32_t pack_cmp; |
michael@0 | 2540 | int dst_stride, mask_stride; |
michael@0 | 2541 | |
michael@0 | 2542 | __m128i xmm_src; |
michael@0 | 2543 | __m128i xmm_dst; |
michael@0 | 2544 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
michael@0 | 2545 | |
michael@0 | 2546 | __m128i mmx_src, mmx_mask, mmx_dest; |
michael@0 | 2547 | |
michael@0 | 2548 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
michael@0 | 2549 | |
michael@0 | 2550 | if (src == 0) |
michael@0 | 2551 | return; |
michael@0 | 2552 | |
michael@0 | 2553 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 2554 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
michael@0 | 2555 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 2556 | mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
michael@0 | 2557 | |
michael@0 | 2558 | xmm_src = _mm_unpacklo_epi8 ( |
michael@0 | 2559 | create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); |
michael@0 | 2560 | mmx_src = xmm_src; |
michael@0 | 2561 | |
michael@0 | 2562 | while (height--) |
michael@0 | 2563 | { |
michael@0 | 2564 | int w = width; |
michael@0 | 2565 | const uint32_t *pm = (uint32_t *)mask_line; |
michael@0 | 2566 | uint32_t *pd = (uint32_t *)dst_line; |
michael@0 | 2567 | |
michael@0 | 2568 | dst_line += dst_stride; |
michael@0 | 2569 | mask_line += mask_stride; |
michael@0 | 2570 | |
michael@0 | 2571 | while (w && (uintptr_t)pd & 15) |
michael@0 | 2572 | { |
michael@0 | 2573 | m = *pm++; |
michael@0 | 2574 | |
michael@0 | 2575 | if (m) |
michael@0 | 2576 | { |
michael@0 | 2577 | d = *pd; |
michael@0 | 2578 | |
michael@0 | 2579 | mmx_mask = unpack_32_1x128 (m); |
michael@0 | 2580 | mmx_dest = unpack_32_1x128 (d); |
michael@0 | 2581 | |
michael@0 | 2582 | *pd = pack_1x128_32 ( |
michael@0 | 2583 | _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), |
michael@0 | 2584 | mmx_dest)); |
michael@0 | 2585 | } |
michael@0 | 2586 | |
michael@0 | 2587 | pd++; |
michael@0 | 2588 | w--; |
michael@0 | 2589 | } |
michael@0 | 2590 | |
michael@0 | 2591 | while (w >= 4) |
michael@0 | 2592 | { |
michael@0 | 2593 | xmm_mask = load_128_unaligned ((__m128i*)pm); |
michael@0 | 2594 | |
michael@0 | 2595 | pack_cmp = |
michael@0 | 2596 | _mm_movemask_epi8 ( |
michael@0 | 2597 | _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); |
michael@0 | 2598 | |
michael@0 | 2599 | /* if all bits in the mask are zero, pack_cmp equals 0xffff */
michael@0 | 2600 | if (pack_cmp != 0xffff) |
michael@0 | 2601 | { |
michael@0 | 2602 | xmm_dst = load_128_aligned ((__m128i*)pd); |
michael@0 | 2603 | |
michael@0 | 2604 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 2605 | |
michael@0 | 2606 | pix_multiply_2x128 (&xmm_src, &xmm_src, |
michael@0 | 2607 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 2608 | &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 2609 | xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi); |
michael@0 | 2610 | |
michael@0 | 2611 | save_128_aligned ( |
michael@0 | 2612 | (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst)); |
michael@0 | 2613 | } |
michael@0 | 2614 | |
michael@0 | 2615 | pd += 4; |
michael@0 | 2616 | pm += 4; |
michael@0 | 2617 | w -= 4; |
michael@0 | 2618 | } |
michael@0 | 2619 | |
michael@0 | 2620 | while (w) |
michael@0 | 2621 | { |
michael@0 | 2622 | m = *pm++; |
michael@0 | 2623 | |
michael@0 | 2624 | if (m) |
michael@0 | 2625 | { |
michael@0 | 2626 | d = *pd; |
michael@0 | 2627 | |
michael@0 | 2628 | mmx_mask = unpack_32_1x128 (m); |
michael@0 | 2629 | mmx_dest = unpack_32_1x128 (d); |
michael@0 | 2630 | |
michael@0 | 2631 | *pd = pack_1x128_32 ( |
michael@0 | 2632 | _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src), |
michael@0 | 2633 | mmx_dest)); |
michael@0 | 2634 | } |
michael@0 | 2635 | |
michael@0 | 2636 | pd++; |
michael@0 | 2637 | w--; |
michael@0 | 2638 | } |
michael@0 | 2639 | } |
michael@0 | 2640 | |
michael@0 | 2641 | } |
michael@0 | 2642 | |
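/* OVER of a solid color through an a8r8g8b8 component-alpha mask:
 * dest = color * mask + dest * (1 - color.a * mask), again skipping
 * four-pixel groups whose mask is entirely zero.
 */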
michael@0 | 2643 | static void |
michael@0 | 2644 | sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, |
michael@0 | 2645 | pixman_composite_info_t *info) |
michael@0 | 2646 | { |
michael@0 | 2647 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 2648 | uint32_t src; |
michael@0 | 2649 | uint32_t *dst_line, d; |
michael@0 | 2650 | uint32_t *mask_line, m; |
michael@0 | 2651 | uint32_t pack_cmp; |
michael@0 | 2652 | int dst_stride, mask_stride; |
michael@0 | 2653 | |
michael@0 | 2654 | __m128i xmm_src, xmm_alpha; |
michael@0 | 2655 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
michael@0 | 2656 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
michael@0 | 2657 | |
michael@0 | 2658 | __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; |
michael@0 | 2659 | |
michael@0 | 2660 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
michael@0 | 2661 | |
michael@0 | 2662 | if (src == 0) |
michael@0 | 2663 | return; |
michael@0 | 2664 | |
michael@0 | 2665 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 2666 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
michael@0 | 2667 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 2668 | mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
michael@0 | 2669 | |
michael@0 | 2670 | xmm_src = _mm_unpacklo_epi8 ( |
michael@0 | 2671 | create_mask_2x32_128 (src, src), _mm_setzero_si128 ()); |
michael@0 | 2672 | xmm_alpha = expand_alpha_1x128 (xmm_src); |
michael@0 | 2673 | mmx_src = xmm_src; |
michael@0 | 2674 | mmx_alpha = xmm_alpha; |
michael@0 | 2675 | |
michael@0 | 2676 | while (height--) |
michael@0 | 2677 | { |
michael@0 | 2678 | int w = width; |
michael@0 | 2679 | const uint32_t *pm = (uint32_t *)mask_line; |
michael@0 | 2680 | uint32_t *pd = (uint32_t *)dst_line; |
michael@0 | 2681 | |
michael@0 | 2682 | dst_line += dst_stride; |
michael@0 | 2683 | mask_line += mask_stride; |
michael@0 | 2684 | |
michael@0 | 2685 | while (w && (uintptr_t)pd & 15) |
michael@0 | 2686 | { |
michael@0 | 2687 | m = *pm++; |
michael@0 | 2688 | |
michael@0 | 2689 | if (m) |
michael@0 | 2690 | { |
michael@0 | 2691 | d = *pd; |
michael@0 | 2692 | mmx_mask = unpack_32_1x128 (m); |
michael@0 | 2693 | mmx_dest = unpack_32_1x128 (d); |
michael@0 | 2694 | |
michael@0 | 2695 | *pd = pack_1x128_32 (in_over_1x128 (&mmx_src, |
michael@0 | 2696 | &mmx_alpha, |
michael@0 | 2697 | &mmx_mask, |
michael@0 | 2698 | &mmx_dest)); |
michael@0 | 2699 | } |
michael@0 | 2700 | |
michael@0 | 2701 | pd++; |
michael@0 | 2702 | w--; |
michael@0 | 2703 | } |
michael@0 | 2704 | |
michael@0 | 2705 | while (w >= 4) |
michael@0 | 2706 | { |
michael@0 | 2707 | xmm_mask = load_128_unaligned ((__m128i*)pm); |
michael@0 | 2708 | |
michael@0 | 2709 | pack_cmp = |
michael@0 | 2710 | _mm_movemask_epi8 ( |
michael@0 | 2711 | _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); |
michael@0 | 2712 | |
michael@0 | 2713 | /* if all bits in the mask are zero, pack_cmp equals 0xffff */
michael@0 | 2714 | if (pack_cmp != 0xffff) |
michael@0 | 2715 | { |
michael@0 | 2716 | xmm_dst = load_128_aligned ((__m128i*)pd); |
michael@0 | 2717 | |
michael@0 | 2718 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 2719 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 2720 | |
michael@0 | 2721 | in_over_2x128 (&xmm_src, &xmm_src, |
michael@0 | 2722 | &xmm_alpha, &xmm_alpha, |
michael@0 | 2723 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 2724 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 2725 | |
michael@0 | 2726 | save_128_aligned ( |
michael@0 | 2727 | (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 2728 | } |
michael@0 | 2729 | |
michael@0 | 2730 | pd += 4; |
michael@0 | 2731 | pm += 4; |
michael@0 | 2732 | w -= 4; |
michael@0 | 2733 | } |
michael@0 | 2734 | |
michael@0 | 2735 | while (w) |
michael@0 | 2736 | { |
michael@0 | 2737 | m = *pm++; |
michael@0 | 2738 | |
michael@0 | 2739 | if (m) |
michael@0 | 2740 | { |
michael@0 | 2741 | d = *pd; |
michael@0 | 2742 | mmx_mask = unpack_32_1x128 (m); |
michael@0 | 2743 | mmx_dest = unpack_32_1x128 (d); |
michael@0 | 2744 | |
michael@0 | 2745 | *pd = pack_1x128_32 ( |
michael@0 | 2746 | in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)); |
michael@0 | 2747 | } |
michael@0 | 2748 | |
michael@0 | 2749 | pd++; |
michael@0 | 2750 | w--; |
michael@0 | 2751 | } |
michael@0 | 2752 | } |
michael@0 | 2753 | |
michael@0 | 2754 | } |
michael@0 | 2755 | |
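/* OVER of an a8r8g8b8 source scaled by a solid mask: only the mask's
 * alpha byte is used, replicated to every 16-bit lane with
 * create_mask_16_128, and all-zero source groups are skipped.
 */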
michael@0 | 2756 | static void |
michael@0 | 2757 | sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp, |
michael@0 | 2758 | pixman_composite_info_t *info) |
michael@0 | 2759 | { |
michael@0 | 2760 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 2761 | uint32_t *dst_line, *dst; |
michael@0 | 2762 | uint32_t *src_line, *src; |
michael@0 | 2763 | uint32_t mask; |
michael@0 | 2764 | int32_t w; |
michael@0 | 2765 | int dst_stride, src_stride; |
michael@0 | 2766 | |
michael@0 | 2767 | __m128i xmm_mask; |
michael@0 | 2768 | __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
michael@0 | 2769 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
michael@0 | 2770 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
michael@0 | 2771 | |
michael@0 | 2772 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 2773 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
michael@0 | 2774 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 2775 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
michael@0 | 2776 | |
michael@0 | 2777 | mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); |
michael@0 | 2778 | |
michael@0 | 2779 | xmm_mask = create_mask_16_128 (mask >> 24); |
michael@0 | 2780 | |
michael@0 | 2781 | while (height--) |
michael@0 | 2782 | { |
michael@0 | 2783 | dst = dst_line; |
michael@0 | 2784 | dst_line += dst_stride; |
michael@0 | 2785 | src = src_line; |
michael@0 | 2786 | src_line += src_stride; |
michael@0 | 2787 | w = width; |
michael@0 | 2788 | |
michael@0 | 2789 | while (w && (uintptr_t)dst & 15) |
michael@0 | 2790 | { |
michael@0 | 2791 | uint32_t s = *src++; |
michael@0 | 2792 | |
michael@0 | 2793 | if (s) |
michael@0 | 2794 | { |
michael@0 | 2795 | uint32_t d = *dst; |
michael@0 | 2796 | |
michael@0 | 2797 | __m128i ms = unpack_32_1x128 (s); |
michael@0 | 2798 | __m128i alpha = expand_alpha_1x128 (ms); |
michael@0 | 2799 | __m128i dest = xmm_mask; |
michael@0 | 2800 | __m128i alpha_dst = unpack_32_1x128 (d); |
michael@0 | 2801 | |
michael@0 | 2802 | *dst = pack_1x128_32 ( |
michael@0 | 2803 | in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); |
michael@0 | 2804 | } |
michael@0 | 2805 | dst++; |
michael@0 | 2806 | w--; |
michael@0 | 2807 | } |
michael@0 | 2808 | |
michael@0 | 2809 | while (w >= 4) |
michael@0 | 2810 | { |
michael@0 | 2811 | xmm_src = load_128_unaligned ((__m128i*)src); |
michael@0 | 2812 | |
michael@0 | 2813 | if (!is_zero (xmm_src)) |
michael@0 | 2814 | { |
michael@0 | 2815 | xmm_dst = load_128_aligned ((__m128i*)dst); |
michael@0 | 2816 | |
michael@0 | 2817 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 2818 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 2819 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
michael@0 | 2820 | &xmm_alpha_lo, &xmm_alpha_hi); |
michael@0 | 2821 | |
michael@0 | 2822 | in_over_2x128 (&xmm_src_lo, &xmm_src_hi, |
michael@0 | 2823 | &xmm_alpha_lo, &xmm_alpha_hi, |
michael@0 | 2824 | &xmm_mask, &xmm_mask, |
michael@0 | 2825 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 2826 | |
michael@0 | 2827 | save_128_aligned ( |
michael@0 | 2828 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 2829 | } |
michael@0 | 2830 | |
michael@0 | 2831 | dst += 4; |
michael@0 | 2832 | src += 4; |
michael@0 | 2833 | w -= 4; |
michael@0 | 2834 | } |
michael@0 | 2835 | |
michael@0 | 2836 | while (w) |
michael@0 | 2837 | { |
michael@0 | 2838 | uint32_t s = *src++; |
michael@0 | 2839 | |
michael@0 | 2840 | if (s) |
michael@0 | 2841 | { |
michael@0 | 2842 | uint32_t d = *dst; |
michael@0 | 2843 | |
michael@0 | 2844 | __m128i ms = unpack_32_1x128 (s); |
michael@0 | 2845 | __m128i alpha = expand_alpha_1x128 (ms); |
michael@0 | 2846 | __m128i mask = xmm_mask; |
michael@0 | 2847 | __m128i dest = unpack_32_1x128 (d); |
michael@0 | 2848 | |
michael@0 | 2849 | *dst = pack_1x128_32 ( |
michael@0 | 2850 | in_over_1x128 (&ms, &alpha, &mask, &dest)); |
michael@0 | 2851 | } |
michael@0 | 2852 | |
michael@0 | 2853 | dst++; |
michael@0 | 2854 | w--; |
michael@0 | 2855 | } |
michael@0 | 2856 | } |
michael@0 | 2857 | |
michael@0 | 2858 | } |
michael@0 | 2859 | |
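/* SRC conversion from x8r8g8b8 to r5g6b5: each pixel is truncated to
 * 5-6-5 bits per channel, i.e.
 *
 *   565 = ((p >> 8) & 0xf800) | ((p >> 5) & 0x07e0) | ((p >> 3) & 0x001f)
 *
 * with eight pixels packed per SSE2 store once dst is 16-byte aligned.
 */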
michael@0 | 2860 | static void |
michael@0 | 2861 | sse2_composite_src_x888_0565 (pixman_implementation_t *imp, |
michael@0 | 2862 | pixman_composite_info_t *info) |
michael@0 | 2863 | { |
michael@0 | 2864 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 2865 | uint16_t *dst_line, *dst; |
michael@0 | 2866 | uint32_t *src_line, *src, s; |
michael@0 | 2867 | int dst_stride, src_stride; |
michael@0 | 2868 | int32_t w; |
michael@0 | 2869 | |
michael@0 | 2870 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
michael@0 | 2871 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
michael@0 | 2872 | |
michael@0 | 2873 | while (height--) |
michael@0 | 2874 | { |
michael@0 | 2875 | dst = dst_line; |
michael@0 | 2876 | dst_line += dst_stride; |
michael@0 | 2877 | src = src_line; |
michael@0 | 2878 | src_line += src_stride; |
michael@0 | 2879 | w = width; |
michael@0 | 2880 | |
michael@0 | 2881 | while (w && (uintptr_t)dst & 15) |
michael@0 | 2882 | { |
michael@0 | 2883 | s = *src++; |
michael@0 | 2884 | *dst = convert_8888_to_0565 (s); |
michael@0 | 2885 | dst++; |
michael@0 | 2886 | w--; |
michael@0 | 2887 | } |
michael@0 | 2888 | |
michael@0 | 2889 | while (w >= 8) |
michael@0 | 2890 | { |
michael@0 | 2891 | __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0); |
michael@0 | 2892 | __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1); |
michael@0 | 2893 | |
michael@0 | 2894 | save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1)); |
michael@0 | 2895 | |
michael@0 | 2896 | w -= 8; |
michael@0 | 2897 | src += 8; |
michael@0 | 2898 | dst += 8; |
michael@0 | 2899 | } |
michael@0 | 2900 | |
michael@0 | 2901 | while (w) |
michael@0 | 2902 | { |
michael@0 | 2903 | s = *src++; |
michael@0 | 2904 | *dst = convert_8888_to_0565 (s); |
michael@0 | 2905 | dst++; |
michael@0 | 2906 | w--; |
michael@0 | 2907 | } |
michael@0 | 2908 | } |
michael@0 | 2909 | } |
michael@0 | 2910 | |
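/* SRC copy from x8r8g8b8 to a8r8g8b8: pixels are copied verbatim with
 * the undefined alpha byte forced to 0xff, 16 pixels per unrolled
 * SSE2 iteration.
 */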
michael@0 | 2911 | static void |
michael@0 | 2912 | sse2_composite_src_x888_8888 (pixman_implementation_t *imp, |
michael@0 | 2913 | pixman_composite_info_t *info) |
michael@0 | 2914 | { |
michael@0 | 2915 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 2916 | uint32_t *dst_line, *dst; |
michael@0 | 2917 | uint32_t *src_line, *src; |
michael@0 | 2918 | int32_t w; |
michael@0 | 2919 | int dst_stride, src_stride; |
michael@0 | 2920 | |
michael@0 | 2922 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 2923 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
michael@0 | 2924 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 2925 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
michael@0 | 2926 | |
michael@0 | 2927 | while (height--) |
michael@0 | 2928 | { |
michael@0 | 2929 | dst = dst_line; |
michael@0 | 2930 | dst_line += dst_stride; |
michael@0 | 2931 | src = src_line; |
michael@0 | 2932 | src_line += src_stride; |
michael@0 | 2933 | w = width; |
michael@0 | 2934 | |
michael@0 | 2935 | while (w && (uintptr_t)dst & 15) |
michael@0 | 2936 | { |
michael@0 | 2937 | *dst++ = *src++ | 0xff000000; |
michael@0 | 2938 | w--; |
michael@0 | 2939 | } |
michael@0 | 2940 | |
michael@0 | 2941 | while (w >= 16) |
michael@0 | 2942 | { |
michael@0 | 2943 | __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4; |
michael@0 | 2944 | |
michael@0 | 2945 | xmm_src1 = load_128_unaligned ((__m128i*)src + 0); |
michael@0 | 2946 | xmm_src2 = load_128_unaligned ((__m128i*)src + 1); |
michael@0 | 2947 | xmm_src3 = load_128_unaligned ((__m128i*)src + 2); |
michael@0 | 2948 | xmm_src4 = load_128_unaligned ((__m128i*)src + 3); |
michael@0 | 2949 | |
michael@0 | 2950 | save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000)); |
michael@0 | 2951 | save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000)); |
michael@0 | 2952 | save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000)); |
michael@0 | 2953 | save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000)); |
michael@0 | 2954 | |
michael@0 | 2955 | dst += 16; |
michael@0 | 2956 | src += 16; |
michael@0 | 2957 | w -= 16; |
michael@0 | 2958 | } |
michael@0 | 2959 | |
michael@0 | 2960 | while (w) |
michael@0 | 2961 | { |
michael@0 | 2962 | *dst++ = *src++ | 0xff000000; |
michael@0 | 2963 | w--; |
michael@0 | 2964 | } |
michael@0 | 2965 | } |
michael@0 | 2966 | |
michael@0 | 2967 | } |
michael@0 | 2968 | |
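/* OVER of an x8r8g8b8 source, modulated by a solid mask, onto an
 * a8r8g8b8 destination. ORing in 0xff000000 makes the source opaque,
 * so the blend reduces to dest = src*mask + dest*(1 - mask).
 */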
michael@0 | 2969 | static void |
michael@0 | 2970 | sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp, |
michael@0 | 2971 | pixman_composite_info_t *info) |
michael@0 | 2972 | { |
michael@0 | 2973 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 2974 | uint32_t *dst_line, *dst; |
michael@0 | 2975 | uint32_t *src_line, *src; |
michael@0 | 2976 | uint32_t mask; |
michael@0 | 2977 | int dst_stride, src_stride; |
michael@0 | 2978 | int32_t w; |
michael@0 | 2979 | |
michael@0 | 2980 | __m128i xmm_mask, xmm_alpha; |
michael@0 | 2981 | __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
michael@0 | 2982 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
michael@0 | 2983 | |
michael@0 | 2984 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 2985 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
michael@0 | 2986 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 2987 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
michael@0 | 2988 | |
michael@0 | 2989 | mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8); |
michael@0 | 2990 | |
michael@0 | 2991 | xmm_mask = create_mask_16_128 (mask >> 24); |
michael@0 | 2992 | xmm_alpha = mask_00ff; |
michael@0 | 2993 | |
michael@0 | 2994 | while (height--) |
michael@0 | 2995 | { |
michael@0 | 2996 | dst = dst_line; |
michael@0 | 2997 | dst_line += dst_stride; |
michael@0 | 2998 | src = src_line; |
michael@0 | 2999 | src_line += src_stride; |
michael@0 | 3000 | w = width; |
michael@0 | 3001 | |
michael@0 | 3002 | while (w && (uintptr_t)dst & 15) |
michael@0 | 3003 | { |
michael@0 | 3004 | uint32_t s = (*src++) | 0xff000000; |
michael@0 | 3005 | uint32_t d = *dst; |
michael@0 | 3006 | |
michael@0 | 3007 | __m128i src = unpack_32_1x128 (s); |
michael@0 | 3008 | __m128i alpha = xmm_alpha; |
michael@0 | 3009 | __m128i mask = xmm_mask; |
michael@0 | 3010 | __m128i dest = unpack_32_1x128 (d); |
michael@0 | 3011 | |
michael@0 | 3012 | *dst++ = pack_1x128_32 ( |
michael@0 | 3013 | in_over_1x128 (&src, &alpha, &mask, &dest)); |
michael@0 | 3014 | |
michael@0 | 3015 | w--; |
michael@0 | 3016 | } |
michael@0 | 3017 | |
michael@0 | 3018 | while (w >= 4) |
michael@0 | 3019 | { |
michael@0 | 3020 | xmm_src = _mm_or_si128 ( |
michael@0 | 3021 | load_128_unaligned ((__m128i*)src), mask_ff000000); |
michael@0 | 3022 | xmm_dst = load_128_aligned ((__m128i*)dst); |
michael@0 | 3023 | |
michael@0 | 3024 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 3025 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 3026 | |
michael@0 | 3027 | in_over_2x128 (&xmm_src_lo, &xmm_src_hi, |
michael@0 | 3028 | &xmm_alpha, &xmm_alpha, |
michael@0 | 3029 | &xmm_mask, &xmm_mask, |
michael@0 | 3030 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 3031 | |
michael@0 | 3032 | save_128_aligned ( |
michael@0 | 3033 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 3034 | |
michael@0 | 3035 | dst += 4; |
michael@0 | 3036 | src += 4; |
michael@0 | 3037 | w -= 4; |
michael@0 | 3039 | } |
michael@0 | 3040 | |
michael@0 | 3041 | while (w) |
michael@0 | 3042 | { |
michael@0 | 3043 | uint32_t s = (*src++) | 0xff000000; |
michael@0 | 3044 | uint32_t d = *dst; |
michael@0 | 3045 | |
michael@0 | 3046 | __m128i src = unpack_32_1x128 (s); |
michael@0 | 3047 | __m128i alpha = xmm_alpha; |
michael@0 | 3048 | __m128i mask = xmm_mask; |
michael@0 | 3049 | __m128i dest = unpack_32_1x128 (d); |
michael@0 | 3050 | |
michael@0 | 3051 | *dst++ = pack_1x128_32 ( |
michael@0 | 3052 | in_over_1x128 (&src, &alpha, &mask, &dest)); |
michael@0 | 3053 | |
michael@0 | 3054 | w--; |
michael@0 | 3055 | } |
michael@0 | 3056 | } |
michael@0 | 3057 | |
michael@0 | 3058 | } |
michael@0 | 3059 | |
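/* Plain OVER of a8r8g8b8 onto a8r8g8b8, unmasked: each scanline is
 * handed to the generic sse2_combine_over_u combiner.
 *
 * For reference, a minimal scalar sketch of one OVER step follows
 * (illustrative only, not part of the build; over_scalar is a
 * hypothetical name, and the sketch assumes premultiplied ARGB, so no
 * per-channel sum can exceed 255):
 *
 *   uint32_t over_scalar (uint32_t s, uint32_t d)
 *   {
 *       uint32_t ia = 255 - (s >> 24);   // inverse source alpha
 *       uint32_t r = s;
 *       int i;
 *
 *       for (i = 0; i < 32; i += 8)      // one 8-bit channel at a time
 *       {
 *           uint32_t c = ((d >> i) & 0xff) * ia + 128;
 *           r += (((c + (c >> 8)) >> 8) & 0xff) << i;  // += d*ia/255
 *       }
 *       return r;
 *   }
 */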
michael@0 | 3060 | static void |
michael@0 | 3061 | sse2_composite_over_8888_8888 (pixman_implementation_t *imp, |
michael@0 | 3062 | pixman_composite_info_t *info) |
michael@0 | 3063 | { |
michael@0 | 3064 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 3065 | int dst_stride, src_stride; |
michael@0 | 3066 | uint32_t *dst_line, *dst; |
michael@0 | 3067 | uint32_t *src_line, *src; |
michael@0 | 3068 | |
michael@0 | 3069 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 3070 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
michael@0 | 3071 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 3072 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
michael@0 | 3073 | |
michael@0 | 3074 | dst = dst_line; |
michael@0 | 3075 | src = src_line; |
michael@0 | 3076 | |
michael@0 | 3077 | while (height--) |
michael@0 | 3078 | { |
michael@0 | 3079 | sse2_combine_over_u (imp, op, dst, src, NULL, width); |
michael@0 | 3080 | |
michael@0 | 3081 | dst += dst_stride; |
michael@0 | 3082 | src += src_stride; |
michael@0 | 3083 | } |
michael@0 | 3084 | } |
michael@0 | 3085 | |
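/* OVER a single a8r8g8b8 pixel onto one r5g6b5 pixel: the 565 value
 * is expanded to 32 bits, blended, and packed back down.
 */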
michael@0 | 3086 | static force_inline uint16_t |
michael@0 | 3087 | composite_over_8888_0565pixel (uint32_t src, uint16_t dst) |
michael@0 | 3088 | { |
michael@0 | 3089 | __m128i ms; |
michael@0 | 3090 | |
michael@0 | 3091 | ms = unpack_32_1x128 (src); |
michael@0 | 3092 | return pack_565_32_16 ( |
michael@0 | 3093 | pack_1x128_32 ( |
michael@0 | 3094 | over_1x128 ( |
michael@0 | 3095 | ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst)))); |
michael@0 | 3096 | } |
michael@0 | 3097 | |
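/* OVER of an a8r8g8b8 source onto an r5g6b5 destination, unmasked.
 * The vector loop blends 8 destination pixels (one aligned 128-bit
 * store) per iteration, in two 4-pixel halves.
 */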
michael@0 | 3098 | static void |
michael@0 | 3099 | sse2_composite_over_8888_0565 (pixman_implementation_t *imp, |
michael@0 | 3100 | pixman_composite_info_t *info) |
michael@0 | 3101 | { |
michael@0 | 3102 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 3103 | uint16_t *dst_line, *dst, d; |
michael@0 | 3104 | uint32_t *src_line, *src, s; |
michael@0 | 3105 | int dst_stride, src_stride; |
michael@0 | 3106 | int32_t w; |
michael@0 | 3107 | |
michael@0 | 3108 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
michael@0 | 3109 | __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
michael@0 | 3110 | __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; |
michael@0 | 3111 | |
michael@0 | 3112 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 3113 | dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
michael@0 | 3114 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 3115 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
michael@0 | 3116 | |
michael@0 | 3117 | while (height--) |
michael@0 | 3118 | { |
michael@0 | 3119 | dst = dst_line; |
michael@0 | 3120 | src = src_line; |
michael@0 | 3121 | |
michael@0 | 3122 | dst_line += dst_stride; |
michael@0 | 3123 | src_line += src_stride; |
michael@0 | 3124 | w = width; |
michael@0 | 3125 | |
michael@0 | 3126 | /* Align dst on a 16-byte boundary */ |
michael@0 | 3127 | while (w && ((uintptr_t)dst & 15)) |
michael@0 | 3129 | { |
michael@0 | 3130 | s = *src++; |
michael@0 | 3131 | d = *dst; |
michael@0 | 3132 | |
michael@0 | 3133 | *dst++ = composite_over_8888_0565pixel (s, d); |
michael@0 | 3134 | w--; |
michael@0 | 3135 | } |
michael@0 | 3136 | |
michael@0 | 3137 | /* It's an 8-pixel loop */ |
michael@0 | 3138 | while (w >= 8) |
michael@0 | 3139 | { |
michael@0 | 3140 | /* Load the source unaligned, since its |
michael@0 | 3141 | * address alignment isn't guaranteed. |
michael@0 | 3142 | */ |
michael@0 | 3143 | xmm_src = load_128_unaligned ((__m128i*) src); |
michael@0 | 3144 | xmm_dst = load_128_aligned ((__m128i*) dst); |
michael@0 | 3145 | |
michael@0 | 3146 | /* Unpacking */ |
michael@0 | 3147 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 3148 | unpack_565_128_4x128 (xmm_dst, |
michael@0 | 3149 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); |
michael@0 | 3150 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
michael@0 | 3151 | &xmm_alpha_lo, &xmm_alpha_hi); |
michael@0 | 3152 | |
michael@0 | 3153 | /* Load the next 4 pixels from memory early, |
michael@0 | 3154 | * to hide the read latency behind the blend. |
michael@0 | 3155 | */ |
michael@0 | 3156 | xmm_src = load_128_unaligned ((__m128i*) (src + 4)); |
michael@0 | 3157 | |
michael@0 | 3158 | over_2x128 (&xmm_src_lo, &xmm_src_hi, |
michael@0 | 3159 | &xmm_alpha_lo, &xmm_alpha_hi, |
michael@0 | 3160 | &xmm_dst0, &xmm_dst1); |
michael@0 | 3161 | |
michael@0 | 3162 | /* Unpacking */ |
michael@0 | 3163 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 3164 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
michael@0 | 3165 | &xmm_alpha_lo, &xmm_alpha_hi); |
michael@0 | 3166 | |
michael@0 | 3167 | over_2x128 (&xmm_src_lo, &xmm_src_hi, |
michael@0 | 3168 | &xmm_alpha_lo, &xmm_alpha_hi, |
michael@0 | 3169 | &xmm_dst2, &xmm_dst3); |
michael@0 | 3170 | |
michael@0 | 3171 | save_128_aligned ( |
michael@0 | 3172 | (__m128i*)dst, pack_565_4x128_128 ( |
michael@0 | 3173 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); |
michael@0 | 3174 | |
michael@0 | 3175 | w -= 8; |
michael@0 | 3176 | dst += 8; |
michael@0 | 3177 | src += 8; |
michael@0 | 3178 | } |
michael@0 | 3179 | |
michael@0 | 3180 | while (w--) |
michael@0 | 3181 | { |
michael@0 | 3182 | s = *src++; |
michael@0 | 3183 | d = *dst; |
michael@0 | 3184 | |
michael@0 | 3185 | *dst++ = composite_over_8888_0565pixel (s, d); |
michael@0 | 3186 | } |
michael@0 | 3187 | } |
michael@0 | 3188 | |
michael@0 | 3189 | } |
michael@0 | 3190 | |
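/* OVER of a solid source through an a8 mask onto a8r8g8b8. xmm_def
 * keeps four copies of the raw source pixel so that fully opaque
 * 4-pixel groups (srca == 0xff, m == 0xffffffff) collapse to a single
 * aligned store.
 */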
michael@0 | 3191 | static void |
michael@0 | 3192 | sse2_composite_over_n_8_8888 (pixman_implementation_t *imp, |
michael@0 | 3193 | pixman_composite_info_t *info) |
michael@0 | 3194 | { |
michael@0 | 3195 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 3196 | uint32_t src, srca; |
michael@0 | 3197 | uint32_t *dst_line, *dst; |
michael@0 | 3198 | uint8_t *mask_line, *mask; |
michael@0 | 3199 | int dst_stride, mask_stride; |
michael@0 | 3200 | int32_t w; |
michael@0 | 3201 | uint32_t m, d; |
michael@0 | 3202 | |
michael@0 | 3203 | __m128i xmm_src, xmm_alpha, xmm_def; |
michael@0 | 3204 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
michael@0 | 3205 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
michael@0 | 3206 | |
michael@0 | 3207 | __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; |
michael@0 | 3208 | |
michael@0 | 3209 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
michael@0 | 3210 | |
michael@0 | 3211 | srca = src >> 24; |
michael@0 | 3212 | if (src == 0) |
michael@0 | 3213 | return; |
michael@0 | 3214 | |
michael@0 | 3215 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 3216 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
michael@0 | 3217 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 3218 | mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
michael@0 | 3219 | |
michael@0 | 3220 | xmm_def = create_mask_2x32_128 (src, src); |
michael@0 | 3221 | xmm_src = expand_pixel_32_1x128 (src); |
michael@0 | 3222 | xmm_alpha = expand_alpha_1x128 (xmm_src); |
michael@0 | 3223 | mmx_src = xmm_src; |
michael@0 | 3224 | mmx_alpha = xmm_alpha; |
michael@0 | 3225 | |
michael@0 | 3226 | while (height--) |
michael@0 | 3227 | { |
michael@0 | 3228 | dst = dst_line; |
michael@0 | 3229 | dst_line += dst_stride; |
michael@0 | 3230 | mask = mask_line; |
michael@0 | 3231 | mask_line += mask_stride; |
michael@0 | 3232 | w = width; |
michael@0 | 3233 | |
michael@0 | 3234 | while (w && (uintptr_t)dst & 15) |
michael@0 | 3235 | { |
michael@0 | 3236 | uint8_t m = *mask++; |
michael@0 | 3237 | |
michael@0 | 3238 | if (m) |
michael@0 | 3239 | { |
michael@0 | 3240 | d = *dst; |
michael@0 | 3241 | mmx_mask = expand_pixel_8_1x128 (m); |
michael@0 | 3242 | mmx_dest = unpack_32_1x128 (d); |
michael@0 | 3243 | |
michael@0 | 3244 | *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, |
michael@0 | 3245 | &mmx_alpha, |
michael@0 | 3246 | &mmx_mask, |
michael@0 | 3247 | &mmx_dest)); |
michael@0 | 3248 | } |
michael@0 | 3249 | |
michael@0 | 3250 | w--; |
michael@0 | 3251 | dst++; |
michael@0 | 3252 | } |
michael@0 | 3253 | |
michael@0 | 3254 | while (w >= 4) |
michael@0 | 3255 | { |
michael@0 | 3256 | m = *((uint32_t*)mask); |
michael@0 | 3257 | |
michael@0 | 3258 | if (srca == 0xff && m == 0xffffffff) |
michael@0 | 3259 | { |
michael@0 | 3260 | save_128_aligned ((__m128i*)dst, xmm_def); |
michael@0 | 3261 | } |
michael@0 | 3262 | else if (m) |
michael@0 | 3263 | { |
michael@0 | 3264 | xmm_dst = load_128_aligned ((__m128i*) dst); |
michael@0 | 3265 | xmm_mask = unpack_32_1x128 (m); |
michael@0 | 3266 | xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); |
michael@0 | 3267 | |
michael@0 | 3268 | /* Unpacking */ |
michael@0 | 3269 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 3270 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 3271 | |
michael@0 | 3272 | expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, |
michael@0 | 3273 | &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 3274 | |
michael@0 | 3275 | in_over_2x128 (&xmm_src, &xmm_src, |
michael@0 | 3276 | &xmm_alpha, &xmm_alpha, |
michael@0 | 3277 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 3278 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 3279 | |
michael@0 | 3280 | save_128_aligned ( |
michael@0 | 3281 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 3282 | } |
michael@0 | 3283 | |
michael@0 | 3284 | w -= 4; |
michael@0 | 3285 | dst += 4; |
michael@0 | 3286 | mask += 4; |
michael@0 | 3287 | } |
michael@0 | 3288 | |
michael@0 | 3289 | while (w) |
michael@0 | 3290 | { |
michael@0 | 3291 | uint8_t m = *mask++; |
michael@0 | 3292 | |
michael@0 | 3293 | if (m) |
michael@0 | 3294 | { |
michael@0 | 3295 | d = *dst; |
michael@0 | 3296 | mmx_mask = expand_pixel_8_1x128 (m); |
michael@0 | 3297 | mmx_dest = unpack_32_1x128 (d); |
michael@0 | 3298 | |
michael@0 | 3299 | *dst = pack_1x128_32 (in_over_1x128 (&mmx_src, |
michael@0 | 3300 | &mmx_alpha, |
michael@0 | 3301 | &mmx_mask, |
michael@0 | 3302 | &mmx_dest)); |
michael@0 | 3303 | } |
michael@0 | 3304 | |
michael@0 | 3305 | w--; |
michael@0 | 3306 | dst++; |
michael@0 | 3307 | } |
michael@0 | 3308 | } |
michael@0 | 3309 | |
michael@0 | 3310 | } |
michael@0 | 3311 | |
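/* Solid fill for 8, 16 or 32 bpp destinations; returns FALSE on any
 * other depth. The filler byte/halfword is replicated to 32 bits,
 * scalar stores handle the unaligned head and tail, and the aligned
 * body is written up to 128 bytes per iteration. On 32-bit GCC the
 * force_align_arg_pointer attribute realigns the stack, presumably
 * because that ABI only guarantees 4-byte stack alignment, while the
 * __m128i local needs 16.
 */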
michael@0 | 3312 | #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) |
michael@0 | 3313 | __attribute__((__force_align_arg_pointer__)) |
michael@0 | 3314 | #endif |
michael@0 | 3315 | static pixman_bool_t |
michael@0 | 3316 | sse2_fill (pixman_implementation_t *imp, |
michael@0 | 3317 | uint32_t * bits, |
michael@0 | 3318 | int stride, |
michael@0 | 3319 | int bpp, |
michael@0 | 3320 | int x, |
michael@0 | 3321 | int y, |
michael@0 | 3322 | int width, |
michael@0 | 3323 | int height, |
michael@0 | 3324 | uint32_t filler) |
michael@0 | 3325 | { |
michael@0 | 3326 | uint32_t byte_width; |
michael@0 | 3327 | uint8_t *byte_line; |
michael@0 | 3328 | |
michael@0 | 3329 | __m128i xmm_def; |
michael@0 | 3330 | |
michael@0 | 3331 | if (bpp == 8) |
michael@0 | 3332 | { |
michael@0 | 3333 | uint8_t b; |
michael@0 | 3334 | uint16_t w; |
michael@0 | 3335 | |
michael@0 | 3336 | stride = stride * (int) sizeof (uint32_t) / 1; |
michael@0 | 3337 | byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); |
michael@0 | 3338 | byte_width = width; |
michael@0 | 3339 | stride *= 1; |
michael@0 | 3340 | |
michael@0 | 3341 | b = filler & 0xff; |
michael@0 | 3342 | w = (b << 8) | b; |
michael@0 | 3343 | filler = (w << 16) | w; |
michael@0 | 3344 | } |
michael@0 | 3345 | else if (bpp == 16) |
michael@0 | 3346 | { |
michael@0 | 3347 | stride = stride * (int) sizeof (uint32_t) / 2; |
michael@0 | 3348 | byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); |
michael@0 | 3349 | byte_width = 2 * width; |
michael@0 | 3350 | stride *= 2; |
michael@0 | 3351 | |
michael@0 | 3352 | filler = (filler & 0xffff) * 0x00010001; |
michael@0 | 3353 | } |
michael@0 | 3354 | else if (bpp == 32) |
michael@0 | 3355 | { |
michael@0 | 3356 | stride = stride * (int) sizeof (uint32_t) / 4; |
michael@0 | 3357 | byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); |
michael@0 | 3358 | byte_width = 4 * width; |
michael@0 | 3359 | stride *= 4; |
michael@0 | 3360 | } |
michael@0 | 3361 | else |
michael@0 | 3362 | { |
michael@0 | 3363 | return FALSE; |
michael@0 | 3364 | } |
michael@0 | 3365 | |
michael@0 | 3366 | xmm_def = create_mask_2x32_128 (filler, filler); |
michael@0 | 3367 | |
michael@0 | 3368 | while (height--) |
michael@0 | 3369 | { |
michael@0 | 3370 | int w; |
michael@0 | 3371 | uint8_t *d = byte_line; |
michael@0 | 3372 | byte_line += stride; |
michael@0 | 3373 | w = byte_width; |
michael@0 | 3374 | |
michael@0 | 3375 | if (w >= 1 && ((uintptr_t)d & 1)) |
michael@0 | 3376 | { |
michael@0 | 3377 | *(uint8_t *)d = filler; |
michael@0 | 3378 | w -= 1; |
michael@0 | 3379 | d += 1; |
michael@0 | 3380 | } |
michael@0 | 3381 | |
michael@0 | 3382 | while (w >= 2 && ((uintptr_t)d & 3)) |
michael@0 | 3383 | { |
michael@0 | 3384 | *(uint16_t *)d = filler; |
michael@0 | 3385 | w -= 2; |
michael@0 | 3386 | d += 2; |
michael@0 | 3387 | } |
michael@0 | 3388 | |
michael@0 | 3389 | while (w >= 4 && ((uintptr_t)d & 15)) |
michael@0 | 3390 | { |
michael@0 | 3391 | *(uint32_t *)d = filler; |
michael@0 | 3392 | |
michael@0 | 3393 | w -= 4; |
michael@0 | 3394 | d += 4; |
michael@0 | 3395 | } |
michael@0 | 3396 | |
michael@0 | 3397 | while (w >= 128) |
michael@0 | 3398 | { |
michael@0 | 3399 | save_128_aligned ((__m128i*)(d), xmm_def); |
michael@0 | 3400 | save_128_aligned ((__m128i*)(d + 16), xmm_def); |
michael@0 | 3401 | save_128_aligned ((__m128i*)(d + 32), xmm_def); |
michael@0 | 3402 | save_128_aligned ((__m128i*)(d + 48), xmm_def); |
michael@0 | 3403 | save_128_aligned ((__m128i*)(d + 64), xmm_def); |
michael@0 | 3404 | save_128_aligned ((__m128i*)(d + 80), xmm_def); |
michael@0 | 3405 | save_128_aligned ((__m128i*)(d + 96), xmm_def); |
michael@0 | 3406 | save_128_aligned ((__m128i*)(d + 112), xmm_def); |
michael@0 | 3407 | |
michael@0 | 3408 | d += 128; |
michael@0 | 3409 | w -= 128; |
michael@0 | 3410 | } |
michael@0 | 3411 | |
michael@0 | 3412 | if (w >= 64) |
michael@0 | 3413 | { |
michael@0 | 3414 | save_128_aligned ((__m128i*)(d), xmm_def); |
michael@0 | 3415 | save_128_aligned ((__m128i*)(d + 16), xmm_def); |
michael@0 | 3416 | save_128_aligned ((__m128i*)(d + 32), xmm_def); |
michael@0 | 3417 | save_128_aligned ((__m128i*)(d + 48), xmm_def); |
michael@0 | 3418 | |
michael@0 | 3419 | d += 64; |
michael@0 | 3420 | w -= 64; |
michael@0 | 3421 | } |
michael@0 | 3422 | |
michael@0 | 3423 | if (w >= 32) |
michael@0 | 3424 | { |
michael@0 | 3425 | save_128_aligned ((__m128i*)(d), xmm_def); |
michael@0 | 3426 | save_128_aligned ((__m128i*)(d + 16), xmm_def); |
michael@0 | 3427 | |
michael@0 | 3428 | d += 32; |
michael@0 | 3429 | w -= 32; |
michael@0 | 3430 | } |
michael@0 | 3431 | |
michael@0 | 3432 | if (w >= 16) |
michael@0 | 3433 | { |
michael@0 | 3434 | save_128_aligned ((__m128i*)(d), xmm_def); |
michael@0 | 3435 | |
michael@0 | 3436 | d += 16; |
michael@0 | 3437 | w -= 16; |
michael@0 | 3438 | } |
michael@0 | 3439 | |
michael@0 | 3440 | while (w >= 4) |
michael@0 | 3441 | { |
michael@0 | 3442 | *(uint32_t *)d = filler; |
michael@0 | 3443 | |
michael@0 | 3444 | w -= 4; |
michael@0 | 3445 | d += 4; |
michael@0 | 3446 | } |
michael@0 | 3447 | |
michael@0 | 3448 | if (w >= 2) |
michael@0 | 3449 | { |
michael@0 | 3450 | *(uint16_t *)d = filler; |
michael@0 | 3451 | w -= 2; |
michael@0 | 3452 | d += 2; |
michael@0 | 3453 | } |
michael@0 | 3454 | |
michael@0 | 3455 | if (w >= 1) |
michael@0 | 3456 | { |
michael@0 | 3457 | *(uint8_t *)d = filler; |
michael@0 | 3458 | w -= 1; |
michael@0 | 3459 | d += 1; |
michael@0 | 3460 | } |
michael@0 | 3461 | } |
michael@0 | 3462 | |
michael@0 | 3463 | return TRUE; |
michael@0 | 3464 | } |
michael@0 | 3465 | |
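/* SRC of a solid source through an a8 mask onto a8r8g8b8: the result
 * is src * mask, and a zero mask writes zero (SRC replaces the
 * destination instead of blending). A zero source degenerates into a
 * transparent fill via sse2_fill.
 */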
michael@0 | 3466 | static void |
michael@0 | 3467 | sse2_composite_src_n_8_8888 (pixman_implementation_t *imp, |
michael@0 | 3468 | pixman_composite_info_t *info) |
michael@0 | 3469 | { |
michael@0 | 3470 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 3471 | uint32_t src, srca; |
michael@0 | 3472 | uint32_t *dst_line, *dst; |
michael@0 | 3473 | uint8_t *mask_line, *mask; |
michael@0 | 3474 | int dst_stride, mask_stride; |
michael@0 | 3475 | int32_t w; |
michael@0 | 3476 | uint32_t m; |
michael@0 | 3477 | |
michael@0 | 3478 | __m128i xmm_src, xmm_def; |
michael@0 | 3479 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
michael@0 | 3480 | |
michael@0 | 3481 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
michael@0 | 3482 | |
michael@0 | 3483 | srca = src >> 24; |
michael@0 | 3484 | if (src == 0) |
michael@0 | 3485 | { |
michael@0 | 3486 | sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride, |
michael@0 | 3487 | PIXMAN_FORMAT_BPP (dest_image->bits.format), |
michael@0 | 3488 | dest_x, dest_y, width, height, 0); |
michael@0 | 3489 | return; |
michael@0 | 3490 | } |
michael@0 | 3491 | |
michael@0 | 3492 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 3493 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
michael@0 | 3494 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 3495 | mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
michael@0 | 3496 | |
michael@0 | 3497 | xmm_def = create_mask_2x32_128 (src, src); |
michael@0 | 3498 | xmm_src = expand_pixel_32_1x128 (src); |
michael@0 | 3499 | |
michael@0 | 3500 | while (height--) |
michael@0 | 3501 | { |
michael@0 | 3502 | dst = dst_line; |
michael@0 | 3503 | dst_line += dst_stride; |
michael@0 | 3504 | mask = mask_line; |
michael@0 | 3505 | mask_line += mask_stride; |
michael@0 | 3506 | w = width; |
michael@0 | 3507 | |
michael@0 | 3508 | while (w && (uintptr_t)dst & 15) |
michael@0 | 3509 | { |
michael@0 | 3510 | uint8_t m = *mask++; |
michael@0 | 3511 | |
michael@0 | 3512 | if (m) |
michael@0 | 3513 | { |
michael@0 | 3514 | *dst = pack_1x128_32 ( |
michael@0 | 3515 | pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m))); |
michael@0 | 3516 | } |
michael@0 | 3517 | else |
michael@0 | 3518 | { |
michael@0 | 3519 | *dst = 0; |
michael@0 | 3520 | } |
michael@0 | 3521 | |
michael@0 | 3522 | w--; |
michael@0 | 3523 | dst++; |
michael@0 | 3524 | } |
michael@0 | 3525 | |
michael@0 | 3526 | while (w >= 4) |
michael@0 | 3527 | { |
michael@0 | 3528 | m = *((uint32_t*)mask); |
michael@0 | 3529 | |
michael@0 | 3530 | if (srca == 0xff && m == 0xffffffff) |
michael@0 | 3531 | { |
michael@0 | 3532 | save_128_aligned ((__m128i*)dst, xmm_def); |
michael@0 | 3533 | } |
michael@0 | 3534 | else if (m) |
michael@0 | 3535 | { |
michael@0 | 3536 | xmm_mask = unpack_32_1x128 (m); |
michael@0 | 3537 | xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); |
michael@0 | 3538 | |
michael@0 | 3539 | /* Unpacking */ |
michael@0 | 3540 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 3541 | |
michael@0 | 3542 | expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, |
michael@0 | 3543 | &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 3544 | |
michael@0 | 3545 | pix_multiply_2x128 (&xmm_src, &xmm_src, |
michael@0 | 3546 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 3547 | &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 3548 | |
michael@0 | 3549 | save_128_aligned ( |
michael@0 | 3550 | (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi)); |
michael@0 | 3551 | } |
michael@0 | 3552 | else |
michael@0 | 3553 | { |
michael@0 | 3554 | save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ()); |
michael@0 | 3555 | } |
michael@0 | 3556 | |
michael@0 | 3557 | w -= 4; |
michael@0 | 3558 | dst += 4; |
michael@0 | 3559 | mask += 4; |
michael@0 | 3560 | } |
michael@0 | 3561 | |
michael@0 | 3562 | while (w) |
michael@0 | 3563 | { |
michael@0 | 3564 | uint8_t m = *mask++; |
michael@0 | 3565 | |
michael@0 | 3566 | if (m) |
michael@0 | 3567 | { |
michael@0 | 3568 | *dst = pack_1x128_32 ( |
michael@0 | 3569 | pix_multiply_1x128 ( |
michael@0 | 3570 | xmm_src, expand_pixel_8_1x128 (m))); |
michael@0 | 3571 | } |
michael@0 | 3572 | else |
michael@0 | 3573 | { |
michael@0 | 3574 | *dst = 0; |
michael@0 | 3575 | } |
michael@0 | 3576 | |
michael@0 | 3577 | w--; |
michael@0 | 3578 | dst++; |
michael@0 | 3579 | } |
michael@0 | 3580 | } |
michael@0 | 3581 | |
michael@0 | 3582 | } |
michael@0 | 3583 | |
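/* OVER of a solid source through an a8 mask onto an r5g6b5
 * destination. Eight pixels are unpacked per iteration and blended in
 * two 4-pixel halves, each skipped when its four mask bytes are all
 * zero.
 */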
michael@0 | 3584 | static void |
michael@0 | 3585 | sse2_composite_over_n_8_0565 (pixman_implementation_t *imp, |
michael@0 | 3586 | pixman_composite_info_t *info) |
michael@0 | 3587 | { |
michael@0 | 3588 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 3589 | uint32_t src; |
michael@0 | 3590 | uint16_t *dst_line, *dst, d; |
michael@0 | 3591 | uint8_t *mask_line, *mask; |
michael@0 | 3592 | int dst_stride, mask_stride; |
michael@0 | 3593 | int32_t w; |
michael@0 | 3594 | uint32_t m; |
michael@0 | 3595 | __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; |
michael@0 | 3596 | |
michael@0 | 3597 | __m128i xmm_src, xmm_alpha; |
michael@0 | 3598 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
michael@0 | 3599 | __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; |
michael@0 | 3600 | |
michael@0 | 3601 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
michael@0 | 3602 | |
michael@0 | 3603 | if (src == 0) |
michael@0 | 3604 | return; |
michael@0 | 3605 | |
michael@0 | 3606 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 3607 | dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
michael@0 | 3608 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 3609 | mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
michael@0 | 3610 | |
michael@0 | 3611 | xmm_src = expand_pixel_32_1x128 (src); |
michael@0 | 3612 | xmm_alpha = expand_alpha_1x128 (xmm_src); |
michael@0 | 3613 | mmx_src = xmm_src; |
michael@0 | 3614 | mmx_alpha = xmm_alpha; |
michael@0 | 3615 | |
michael@0 | 3616 | while (height--) |
michael@0 | 3617 | { |
michael@0 | 3618 | dst = dst_line; |
michael@0 | 3619 | dst_line += dst_stride; |
michael@0 | 3620 | mask = mask_line; |
michael@0 | 3621 | mask_line += mask_stride; |
michael@0 | 3622 | w = width; |
michael@0 | 3623 | |
michael@0 | 3624 | while (w && (uintptr_t)dst & 15) |
michael@0 | 3625 | { |
michael@0 | 3626 | m = *mask++; |
michael@0 | 3627 | |
michael@0 | 3628 | if (m) |
michael@0 | 3629 | { |
michael@0 | 3630 | d = *dst; |
michael@0 | 3631 | mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); |
michael@0 | 3632 | mmx_dest = expand565_16_1x128 (d); |
michael@0 | 3633 | |
michael@0 | 3634 | *dst = pack_565_32_16 ( |
michael@0 | 3635 | pack_1x128_32 ( |
michael@0 | 3636 | in_over_1x128 ( |
michael@0 | 3637 | &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); |
michael@0 | 3638 | } |
michael@0 | 3639 | |
michael@0 | 3640 | w--; |
michael@0 | 3641 | dst++; |
michael@0 | 3642 | } |
michael@0 | 3643 | |
michael@0 | 3644 | while (w >= 8) |
michael@0 | 3645 | { |
michael@0 | 3646 | xmm_dst = load_128_aligned ((__m128i*) dst); |
michael@0 | 3647 | unpack_565_128_4x128 (xmm_dst, |
michael@0 | 3648 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); |
michael@0 | 3649 | |
michael@0 | 3650 | m = *((uint32_t*)mask); |
michael@0 | 3651 | mask += 4; |
michael@0 | 3652 | |
michael@0 | 3653 | if (m) |
michael@0 | 3654 | { |
michael@0 | 3655 | xmm_mask = unpack_32_1x128 (m); |
michael@0 | 3656 | xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); |
michael@0 | 3657 | |
michael@0 | 3658 | /* Unpacking */ |
michael@0 | 3659 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 3660 | |
michael@0 | 3661 | expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, |
michael@0 | 3662 | &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 3663 | |
michael@0 | 3664 | in_over_2x128 (&xmm_src, &xmm_src, |
michael@0 | 3665 | &xmm_alpha, &xmm_alpha, |
michael@0 | 3666 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 3667 | &xmm_dst0, &xmm_dst1); |
michael@0 | 3668 | } |
michael@0 | 3669 | |
michael@0 | 3670 | m = *((uint32_t*)mask); |
michael@0 | 3671 | mask += 4; |
michael@0 | 3672 | |
michael@0 | 3673 | if (m) |
michael@0 | 3674 | { |
michael@0 | 3675 | xmm_mask = unpack_32_1x128 (m); |
michael@0 | 3676 | xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ()); |
michael@0 | 3677 | |
michael@0 | 3678 | /* Unpacking */ |
michael@0 | 3679 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 3680 | |
michael@0 | 3681 | expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, |
michael@0 | 3682 | &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 3683 | in_over_2x128 (&xmm_src, &xmm_src, |
michael@0 | 3684 | &xmm_alpha, &xmm_alpha, |
michael@0 | 3685 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 3686 | &xmm_dst2, &xmm_dst3); |
michael@0 | 3687 | } |
michael@0 | 3688 | |
michael@0 | 3689 | save_128_aligned ( |
michael@0 | 3690 | (__m128i*)dst, pack_565_4x128_128 ( |
michael@0 | 3691 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); |
michael@0 | 3692 | |
michael@0 | 3693 | w -= 8; |
michael@0 | 3694 | dst += 8; |
michael@0 | 3695 | } |
michael@0 | 3696 | |
michael@0 | 3697 | while (w) |
michael@0 | 3698 | { |
michael@0 | 3699 | m = *mask++; |
michael@0 | 3700 | |
michael@0 | 3701 | if (m) |
michael@0 | 3702 | { |
michael@0 | 3703 | d = *dst; |
michael@0 | 3704 | mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); |
michael@0 | 3705 | mmx_dest = expand565_16_1x128 (d); |
michael@0 | 3706 | |
michael@0 | 3707 | *dst = pack_565_32_16 ( |
michael@0 | 3708 | pack_1x128_32 ( |
michael@0 | 3709 | in_over_1x128 ( |
michael@0 | 3710 | &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); |
michael@0 | 3711 | } |
michael@0 | 3712 | |
michael@0 | 3713 | w--; |
michael@0 | 3714 | dst++; |
michael@0 | 3715 | } |
michael@0 | 3716 | } |
michael@0 | 3717 | |
michael@0 | 3718 | } |
michael@0 | 3719 | |
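/* OVER of a non-premultiplied source with reversed channel order (the
 * "pixbuf" case, presumably a8b8g8r8) onto r5g6b5. Fully opaque
 * vectors only need their colors swapped; fully transparent ones are
 * skipped; everything else goes through over_rev_non_pre, which
 * swaps, premultiplies and blends.
 */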
michael@0 | 3720 | static void |
michael@0 | 3721 | sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp, |
michael@0 | 3722 | pixman_composite_info_t *info) |
michael@0 | 3723 | { |
michael@0 | 3724 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 3725 | uint16_t *dst_line, *dst, d; |
michael@0 | 3726 | uint32_t *src_line, *src, s; |
michael@0 | 3727 | int dst_stride, src_stride; |
michael@0 | 3728 | int32_t w; |
michael@0 | 3729 | uint32_t opaque, zero; |
michael@0 | 3730 | |
michael@0 | 3731 | __m128i ms; |
michael@0 | 3732 | __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
michael@0 | 3733 | __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; |
michael@0 | 3734 | |
michael@0 | 3735 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 3736 | dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
michael@0 | 3737 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 3738 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
michael@0 | 3739 | |
michael@0 | 3740 | while (height--) |
michael@0 | 3741 | { |
michael@0 | 3742 | dst = dst_line; |
michael@0 | 3743 | dst_line += dst_stride; |
michael@0 | 3744 | src = src_line; |
michael@0 | 3745 | src_line += src_stride; |
michael@0 | 3746 | w = width; |
michael@0 | 3747 | |
michael@0 | 3748 | while (w && (uintptr_t)dst & 15) |
michael@0 | 3749 | { |
michael@0 | 3750 | s = *src++; |
michael@0 | 3751 | d = *dst; |
michael@0 | 3752 | |
michael@0 | 3753 | ms = unpack_32_1x128 (s); |
michael@0 | 3754 | |
michael@0 | 3755 | *dst++ = pack_565_32_16 ( |
michael@0 | 3756 | pack_1x128_32 ( |
michael@0 | 3757 | over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); |
michael@0 | 3758 | w--; |
michael@0 | 3759 | } |
michael@0 | 3760 | |
michael@0 | 3761 | while (w >= 8) |
michael@0 | 3762 | { |
michael@0 | 3763 | /* First round */ |
michael@0 | 3764 | xmm_src = load_128_unaligned ((__m128i*)src); |
michael@0 | 3765 | xmm_dst = load_128_aligned ((__m128i*)dst); |
michael@0 | 3766 | |
michael@0 | 3767 | opaque = is_opaque (xmm_src); |
michael@0 | 3768 | zero = is_zero (xmm_src); |
michael@0 | 3769 | |
michael@0 | 3770 | unpack_565_128_4x128 (xmm_dst, |
michael@0 | 3771 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); |
michael@0 | 3772 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 3773 | |
michael@0 | 3774 | /* preload next round */ |
michael@0 | 3775 | xmm_src = load_128_unaligned ((__m128i*)(src + 4)); |
michael@0 | 3776 | |
michael@0 | 3777 | if (opaque) |
michael@0 | 3778 | { |
michael@0 | 3779 | invert_colors_2x128 (xmm_src_lo, xmm_src_hi, |
michael@0 | 3780 | &xmm_dst0, &xmm_dst1); |
michael@0 | 3781 | } |
michael@0 | 3782 | else if (!zero) |
michael@0 | 3783 | { |
michael@0 | 3784 | over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, |
michael@0 | 3785 | &xmm_dst0, &xmm_dst1); |
michael@0 | 3786 | } |
michael@0 | 3787 | |
michael@0 | 3788 | /* Second round */ |
michael@0 | 3789 | opaque = is_opaque (xmm_src); |
michael@0 | 3790 | zero = is_zero (xmm_src); |
michael@0 | 3791 | |
michael@0 | 3792 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 3793 | |
michael@0 | 3794 | if (opaque) |
michael@0 | 3795 | { |
michael@0 | 3796 | invert_colors_2x128 (xmm_src_lo, xmm_src_hi, |
michael@0 | 3797 | &xmm_dst2, &xmm_dst3); |
michael@0 | 3798 | } |
michael@0 | 3799 | else if (!zero) |
michael@0 | 3800 | { |
michael@0 | 3801 | over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, |
michael@0 | 3802 | &xmm_dst2, &xmm_dst3); |
michael@0 | 3803 | } |
michael@0 | 3804 | |
michael@0 | 3805 | save_128_aligned ( |
michael@0 | 3806 | (__m128i*)dst, pack_565_4x128_128 ( |
michael@0 | 3807 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); |
michael@0 | 3808 | |
michael@0 | 3809 | w -= 8; |
michael@0 | 3810 | src += 8; |
michael@0 | 3811 | dst += 8; |
michael@0 | 3812 | } |
michael@0 | 3813 | |
michael@0 | 3814 | while (w) |
michael@0 | 3815 | { |
michael@0 | 3816 | s = *src++; |
michael@0 | 3817 | d = *dst; |
michael@0 | 3818 | |
michael@0 | 3819 | ms = unpack_32_1x128 (s); |
michael@0 | 3820 | |
michael@0 | 3821 | *dst++ = pack_565_32_16 ( |
michael@0 | 3822 | pack_1x128_32 ( |
michael@0 | 3823 | over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d)))); |
michael@0 | 3824 | w--; |
michael@0 | 3825 | } |
michael@0 | 3826 | } |
michael@0 | 3827 | |
michael@0 | 3828 | } |
michael@0 | 3829 | |
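/* As above, but onto an a8r8g8b8 destination: OVER of a
 * non-premultiplied, reversed-channel source, four pixels per
 * iteration.
 */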
michael@0 | 3830 | static void |
michael@0 | 3831 | sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp, |
michael@0 | 3832 | pixman_composite_info_t *info) |
michael@0 | 3833 | { |
michael@0 | 3834 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 3835 | uint32_t *dst_line, *dst, d; |
michael@0 | 3836 | uint32_t *src_line, *src, s; |
michael@0 | 3837 | int dst_stride, src_stride; |
michael@0 | 3838 | int32_t w; |
michael@0 | 3839 | uint32_t opaque, zero; |
michael@0 | 3840 | |
michael@0 | 3841 | __m128i xmm_src_lo, xmm_src_hi; |
michael@0 | 3842 | __m128i xmm_dst_lo, xmm_dst_hi; |
michael@0 | 3843 | |
michael@0 | 3844 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 3845 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
michael@0 | 3846 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 3847 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
michael@0 | 3848 | |
michael@0 | 3849 | while (height--) |
michael@0 | 3850 | { |
michael@0 | 3851 | dst = dst_line; |
michael@0 | 3852 | dst_line += dst_stride; |
michael@0 | 3853 | src = src_line; |
michael@0 | 3854 | src_line += src_stride; |
michael@0 | 3855 | w = width; |
michael@0 | 3856 | |
michael@0 | 3857 | while (w && (uintptr_t)dst & 15) |
michael@0 | 3858 | { |
michael@0 | 3859 | s = *src++; |
michael@0 | 3860 | d = *dst; |
michael@0 | 3861 | |
michael@0 | 3862 | *dst++ = pack_1x128_32 ( |
michael@0 | 3863 | over_rev_non_pre_1x128 ( |
michael@0 | 3864 | unpack_32_1x128 (s), unpack_32_1x128 (d))); |
michael@0 | 3865 | |
michael@0 | 3866 | w--; |
michael@0 | 3867 | } |
michael@0 | 3868 | |
michael@0 | 3869 | while (w >= 4) |
michael@0 | 3870 | { |
michael@0 | 3871 | xmm_src_hi = load_128_unaligned ((__m128i*)src); |
michael@0 | 3872 | |
michael@0 | 3873 | opaque = is_opaque (xmm_src_hi); |
michael@0 | 3874 | zero = is_zero (xmm_src_hi); |
michael@0 | 3875 | |
michael@0 | 3876 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 3877 | |
michael@0 | 3878 | if (opaque) |
michael@0 | 3879 | { |
michael@0 | 3880 | invert_colors_2x128 (xmm_src_lo, xmm_src_hi, |
michael@0 | 3881 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 3882 | |
michael@0 | 3883 | save_128_aligned ( |
michael@0 | 3884 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 3885 | } |
michael@0 | 3886 | else if (!zero) |
michael@0 | 3887 | { |
michael@0 | 3888 | xmm_dst_hi = load_128_aligned ((__m128i*)dst); |
michael@0 | 3889 | |
michael@0 | 3890 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 3891 | |
michael@0 | 3892 | over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi, |
michael@0 | 3893 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 3894 | |
michael@0 | 3895 | save_128_aligned ( |
michael@0 | 3896 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 3897 | } |
michael@0 | 3898 | |
michael@0 | 3899 | w -= 4; |
michael@0 | 3900 | dst += 4; |
michael@0 | 3901 | src += 4; |
michael@0 | 3902 | } |
michael@0 | 3903 | |
michael@0 | 3904 | while (w) |
michael@0 | 3905 | { |
michael@0 | 3906 | s = *src++; |
michael@0 | 3907 | d = *dst; |
michael@0 | 3908 | |
michael@0 | 3909 | *dst++ = pack_1x128_32 ( |
michael@0 | 3910 | over_rev_non_pre_1x128 ( |
michael@0 | 3911 | unpack_32_1x128 (s), unpack_32_1x128 (d))); |
michael@0 | 3912 | |
michael@0 | 3913 | w--; |
michael@0 | 3914 | } |
michael@0 | 3915 | } |
michael@0 | 3916 | |
michael@0 | 3917 | } |
michael@0 | 3918 | |
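/* Component-alpha OVER (the _ca suffix) of a solid source through an
 * a8r8g8b8 mask onto r5g6b5: each mask channel scales the matching
 * source channel. A movemask over cmpeq-with-zero detects 4-pixel
 * groups whose mask is entirely zero so their blend can be skipped.
 */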
michael@0 | 3919 | static void |
michael@0 | 3920 | sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, |
michael@0 | 3921 | pixman_composite_info_t *info) |
michael@0 | 3922 | { |
michael@0 | 3923 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 3924 | uint32_t src; |
michael@0 | 3925 | uint16_t *dst_line, *dst, d; |
michael@0 | 3926 | uint32_t *mask_line, *mask, m; |
michael@0 | 3927 | int dst_stride, mask_stride; |
michael@0 | 3928 | int w; |
michael@0 | 3929 | uint32_t pack_cmp; |
michael@0 | 3930 | |
michael@0 | 3931 | __m128i xmm_src, xmm_alpha; |
michael@0 | 3932 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
michael@0 | 3933 | __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3; |
michael@0 | 3934 | |
michael@0 | 3935 | __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest; |
michael@0 | 3936 | |
michael@0 | 3937 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
michael@0 | 3938 | |
michael@0 | 3939 | if (src == 0) |
michael@0 | 3940 | return; |
michael@0 | 3941 | |
michael@0 | 3942 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 3943 | dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
michael@0 | 3944 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 3945 | mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
michael@0 | 3946 | |
michael@0 | 3947 | xmm_src = expand_pixel_32_1x128 (src); |
michael@0 | 3948 | xmm_alpha = expand_alpha_1x128 (xmm_src); |
michael@0 | 3949 | mmx_src = xmm_src; |
michael@0 | 3950 | mmx_alpha = xmm_alpha; |
michael@0 | 3951 | |
michael@0 | 3952 | while (height--) |
michael@0 | 3953 | { |
michael@0 | 3954 | w = width; |
michael@0 | 3955 | mask = mask_line; |
michael@0 | 3956 | dst = dst_line; |
michael@0 | 3957 | mask_line += mask_stride; |
michael@0 | 3958 | dst_line += dst_stride; |
michael@0 | 3959 | |
michael@0 | 3960 | while (w && ((uintptr_t)dst & 15)) |
michael@0 | 3961 | { |
michael@0 | 3962 | m = *(uint32_t *) mask; |
michael@0 | 3963 | |
michael@0 | 3964 | if (m) |
michael@0 | 3965 | { |
michael@0 | 3966 | d = *dst; |
michael@0 | 3967 | mmx_mask = unpack_32_1x128 (m); |
michael@0 | 3968 | mmx_dest = expand565_16_1x128 (d); |
michael@0 | 3969 | |
michael@0 | 3970 | *dst = pack_565_32_16 ( |
michael@0 | 3971 | pack_1x128_32 ( |
michael@0 | 3972 | in_over_1x128 ( |
michael@0 | 3973 | &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); |
michael@0 | 3974 | } |
michael@0 | 3975 | |
michael@0 | 3976 | w--; |
michael@0 | 3977 | dst++; |
michael@0 | 3978 | mask++; |
michael@0 | 3979 | } |
michael@0 | 3980 | |
michael@0 | 3981 | while (w >= 8) |
michael@0 | 3982 | { |
michael@0 | 3983 | /* First round */ |
michael@0 | 3984 | xmm_mask = load_128_unaligned ((__m128i*)mask); |
michael@0 | 3985 | xmm_dst = load_128_aligned ((__m128i*)dst); |
michael@0 | 3986 | |
michael@0 | 3987 | pack_cmp = _mm_movemask_epi8 ( |
michael@0 | 3988 | _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); |
michael@0 | 3989 | |
michael@0 | 3990 | unpack_565_128_4x128 (xmm_dst, |
michael@0 | 3991 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3); |
michael@0 | 3992 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 3993 | |
michael@0 | 3994 | /* preload next round */ |
michael@0 | 3995 | xmm_mask = load_128_unaligned ((__m128i*)(mask + 4)); |
michael@0 | 3996 | |
michael@0 | 3997 | /* blend only if some of the four mask pixels are non-zero */ |
michael@0 | 3998 | if (pack_cmp != 0xffff) |
michael@0 | 3999 | { |
michael@0 | 4000 | in_over_2x128 (&xmm_src, &xmm_src, |
michael@0 | 4001 | &xmm_alpha, &xmm_alpha, |
michael@0 | 4002 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 4003 | &xmm_dst0, &xmm_dst1); |
michael@0 | 4004 | } |
michael@0 | 4005 | |
michael@0 | 4006 | /* Second round */ |
michael@0 | 4007 | pack_cmp = _mm_movemask_epi8 ( |
michael@0 | 4008 | _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ())); |
michael@0 | 4009 | |
michael@0 | 4010 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 4011 | |
michael@0 | 4012 | if (pack_cmp != 0xffff) |
michael@0 | 4013 | { |
michael@0 | 4014 | in_over_2x128 (&xmm_src, &xmm_src, |
michael@0 | 4015 | &xmm_alpha, &xmm_alpha, |
michael@0 | 4016 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 4017 | &xmm_dst2, &xmm_dst3); |
michael@0 | 4018 | } |
michael@0 | 4019 | |
michael@0 | 4020 | save_128_aligned ( |
michael@0 | 4021 | (__m128i*)dst, pack_565_4x128_128 ( |
michael@0 | 4022 | &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3)); |
michael@0 | 4023 | |
michael@0 | 4024 | w -= 8; |
michael@0 | 4025 | dst += 8; |
michael@0 | 4026 | mask += 8; |
michael@0 | 4027 | } |
michael@0 | 4028 | |
michael@0 | 4029 | while (w) |
michael@0 | 4030 | { |
michael@0 | 4031 | m = *(uint32_t *) mask; |
michael@0 | 4032 | |
michael@0 | 4033 | if (m) |
michael@0 | 4034 | { |
michael@0 | 4035 | d = *dst; |
michael@0 | 4036 | mmx_mask = unpack_32_1x128 (m); |
michael@0 | 4037 | mmx_dest = expand565_16_1x128 (d); |
michael@0 | 4038 | |
michael@0 | 4039 | *dst = pack_565_32_16 ( |
michael@0 | 4040 | pack_1x128_32 ( |
michael@0 | 4041 | in_over_1x128 ( |
michael@0 | 4042 | &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest))); |
michael@0 | 4043 | } |
michael@0 | 4044 | |
michael@0 | 4045 | w--; |
michael@0 | 4046 | dst++; |
michael@0 | 4047 | mask++; |
michael@0 | 4048 | } |
michael@0 | 4049 | } |
michael@0 | 4050 | |
michael@0 | 4051 | } |
michael@0 | 4052 | |
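/* IN of a solid source through an a8 mask onto an a8 destination:
 * dest = alpha(src) * mask * dest, 16 bytes per vector iteration.
 */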
michael@0 | 4053 | static void |
michael@0 | 4054 | sse2_composite_in_n_8_8 (pixman_implementation_t *imp, |
michael@0 | 4055 | pixman_composite_info_t *info) |
michael@0 | 4056 | { |
michael@0 | 4057 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 4058 | uint8_t *dst_line, *dst; |
michael@0 | 4059 | uint8_t *mask_line, *mask; |
michael@0 | 4060 | int dst_stride, mask_stride; |
michael@0 | 4061 | uint32_t d, m; |
michael@0 | 4062 | uint32_t src; |
michael@0 | 4063 | int32_t w; |
michael@0 | 4064 | |
michael@0 | 4065 | __m128i xmm_alpha; |
michael@0 | 4066 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
michael@0 | 4067 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
michael@0 | 4068 | |
michael@0 | 4069 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4070 | dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
michael@0 | 4071 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4072 | mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
michael@0 | 4073 | |
michael@0 | 4074 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
michael@0 | 4075 | |
michael@0 | 4076 | xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); |
michael@0 | 4077 | |
michael@0 | 4078 | while (height--) |
michael@0 | 4079 | { |
michael@0 | 4080 | dst = dst_line; |
michael@0 | 4081 | dst_line += dst_stride; |
michael@0 | 4082 | mask = mask_line; |
michael@0 | 4083 | mask_line += mask_stride; |
michael@0 | 4084 | w = width; |
michael@0 | 4085 | |
michael@0 | 4086 | while (w && ((uintptr_t)dst & 15)) |
michael@0 | 4087 | { |
michael@0 | 4088 | m = (uint32_t) *mask++; |
michael@0 | 4089 | d = (uint32_t) *dst; |
michael@0 | 4090 | |
michael@0 | 4091 | *dst++ = (uint8_t) pack_1x128_32 ( |
michael@0 | 4092 | pix_multiply_1x128 ( |
michael@0 | 4093 | pix_multiply_1x128 (xmm_alpha, |
michael@0 | 4094 | unpack_32_1x128 (m)), |
michael@0 | 4095 | unpack_32_1x128 (d))); |
michael@0 | 4096 | w--; |
michael@0 | 4097 | } |
michael@0 | 4098 | |
michael@0 | 4099 | while (w >= 16) |
michael@0 | 4100 | { |
michael@0 | 4101 | xmm_mask = load_128_unaligned ((__m128i*)mask); |
michael@0 | 4102 | xmm_dst = load_128_aligned ((__m128i*)dst); |
michael@0 | 4103 | |
michael@0 | 4104 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 4105 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 4106 | |
michael@0 | 4107 | pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, |
michael@0 | 4108 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 4109 | &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 4110 | |
michael@0 | 4111 | pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 4112 | &xmm_dst_lo, &xmm_dst_hi, |
michael@0 | 4113 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 4114 | |
michael@0 | 4115 | save_128_aligned ( |
michael@0 | 4116 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 4117 | |
michael@0 | 4118 | mask += 16; |
michael@0 | 4119 | dst += 16; |
michael@0 | 4120 | w -= 16; |
michael@0 | 4121 | } |
michael@0 | 4122 | |
michael@0 | 4123 | while (w) |
michael@0 | 4124 | { |
michael@0 | 4125 | m = (uint32_t) *mask++; |
michael@0 | 4126 | d = (uint32_t) *dst; |
michael@0 | 4127 | |
michael@0 | 4128 | *dst++ = (uint8_t) pack_1x128_32 ( |
michael@0 | 4129 | pix_multiply_1x128 ( |
michael@0 | 4130 | pix_multiply_1x128 ( |
michael@0 | 4131 | xmm_alpha, unpack_32_1x128 (m)), |
michael@0 | 4132 | unpack_32_1x128 (d))); |
michael@0 | 4133 | w--; |
michael@0 | 4134 | } |
michael@0 | 4135 | } |
michael@0 | 4136 | |
michael@0 | 4137 | } |
michael@0 | 4138 | |
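/* IN of a solid source onto an a8 destination: dest is scaled by the
 * source alpha. Alpha 0xff is a no-op; alpha 0x00 reduces to a zero
 * fill via pixman_fill.
 */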
michael@0 | 4139 | static void |
michael@0 | 4140 | sse2_composite_in_n_8 (pixman_implementation_t *imp, |
michael@0 | 4141 | pixman_composite_info_t *info) |
michael@0 | 4142 | { |
michael@0 | 4143 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 4144 | uint8_t *dst_line, *dst; |
michael@0 | 4145 | int dst_stride; |
michael@0 | 4146 | uint32_t d; |
michael@0 | 4147 | uint32_t src; |
michael@0 | 4148 | int32_t w; |
michael@0 | 4149 | |
michael@0 | 4150 | __m128i xmm_alpha; |
michael@0 | 4151 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
michael@0 | 4152 | |
michael@0 | 4153 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4154 | dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
michael@0 | 4155 | |
michael@0 | 4156 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
michael@0 | 4157 | |
michael@0 | 4158 | xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); |
michael@0 | 4159 | |
michael@0 | 4160 | src = src >> 24; |
michael@0 | 4161 | |
michael@0 | 4162 | if (src == 0xff) |
michael@0 | 4163 | return; |
michael@0 | 4164 | |
michael@0 | 4165 | if (src == 0x00) |
michael@0 | 4166 | { |
michael@0 | 4167 | pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, |
michael@0 | 4168 | 8, dest_x, dest_y, width, height, src); |
michael@0 | 4169 | |
michael@0 | 4170 | return; |
michael@0 | 4171 | } |
michael@0 | 4172 | |
michael@0 | 4173 | while (height--) |
michael@0 | 4174 | { |
michael@0 | 4175 | dst = dst_line; |
michael@0 | 4176 | dst_line += dst_stride; |
michael@0 | 4177 | w = width; |
michael@0 | 4178 | |
michael@0 | 4179 | while (w && ((uintptr_t)dst & 15)) |
michael@0 | 4180 | { |
michael@0 | 4181 | d = (uint32_t) *dst; |
michael@0 | 4182 | |
michael@0 | 4183 | *dst++ = (uint8_t) pack_1x128_32 ( |
michael@0 | 4184 | pix_multiply_1x128 ( |
michael@0 | 4185 | xmm_alpha, |
michael@0 | 4186 | unpack_32_1x128 (d))); |
michael@0 | 4187 | w--; |
michael@0 | 4188 | } |
michael@0 | 4189 | |
michael@0 | 4190 | while (w >= 16) |
michael@0 | 4191 | { |
michael@0 | 4192 | xmm_dst = load_128_aligned ((__m128i*)dst); |
michael@0 | 4193 | |
michael@0 | 4194 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 4195 | |
michael@0 | 4196 | pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, |
michael@0 | 4197 | &xmm_dst_lo, &xmm_dst_hi, |
michael@0 | 4198 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 4199 | |
michael@0 | 4200 | save_128_aligned ( |
michael@0 | 4201 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 4202 | |
michael@0 | 4203 | dst += 16; |
michael@0 | 4204 | w -= 16; |
michael@0 | 4205 | } |
michael@0 | 4206 | |
michael@0 | 4207 | while (w) |
michael@0 | 4208 | { |
michael@0 | 4209 | d = (uint32_t) *dst; |
michael@0 | 4210 | |
michael@0 | 4211 | *dst++ = (uint8_t) pack_1x128_32 ( |
michael@0 | 4212 | pix_multiply_1x128 ( |
michael@0 | 4213 | xmm_alpha, |
michael@0 | 4214 | unpack_32_1x128 (d))); |
michael@0 | 4215 | w--; |
michael@0 | 4216 | } |
michael@0 | 4217 | } |
michael@0 | 4218 | |
michael@0 | 4219 | } |
michael@0 | 4220 | |
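/* IN of an a8 source onto an a8 destination: a plain per-byte
 * multiply, dest = src * dest / 255 (rounded).
 */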
michael@0 | 4221 | static void |
michael@0 | 4222 | sse2_composite_in_8_8 (pixman_implementation_t *imp, |
michael@0 | 4223 | pixman_composite_info_t *info) |
michael@0 | 4224 | { |
michael@0 | 4225 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 4226 | uint8_t *dst_line, *dst; |
michael@0 | 4227 | uint8_t *src_line, *src; |
michael@0 | 4228 | int src_stride, dst_stride; |
michael@0 | 4229 | int32_t w; |
michael@0 | 4230 | uint32_t s, d; |
michael@0 | 4231 | |
michael@0 | 4232 | __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
michael@0 | 4233 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
michael@0 | 4234 | |
michael@0 | 4235 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4236 | dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
michael@0 | 4237 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4238 | src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); |
michael@0 | 4239 | |
michael@0 | 4240 | while (height--) |
michael@0 | 4241 | { |
michael@0 | 4242 | dst = dst_line; |
michael@0 | 4243 | dst_line += dst_stride; |
michael@0 | 4244 | src = src_line; |
michael@0 | 4245 | src_line += src_stride; |
michael@0 | 4246 | w = width; |
michael@0 | 4247 | |
michael@0 | 4248 | while (w && ((uintptr_t)dst & 15)) |
michael@0 | 4249 | { |
michael@0 | 4250 | s = (uint32_t) *src++; |
michael@0 | 4251 | d = (uint32_t) *dst; |
michael@0 | 4252 | |
michael@0 | 4253 | *dst++ = (uint8_t) pack_1x128_32 ( |
michael@0 | 4254 | pix_multiply_1x128 ( |
michael@0 | 4255 | unpack_32_1x128 (s), unpack_32_1x128 (d))); |
michael@0 | 4256 | w--; |
michael@0 | 4257 | } |
michael@0 | 4258 | |
michael@0 | 4259 | while (w >= 16) |
michael@0 | 4260 | { |
michael@0 | 4261 | xmm_src = load_128_unaligned ((__m128i*)src); |
michael@0 | 4262 | xmm_dst = load_128_aligned ((__m128i*)dst); |
michael@0 | 4263 | |
michael@0 | 4264 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 4265 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 4266 | |
michael@0 | 4267 | pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi, |
michael@0 | 4268 | &xmm_dst_lo, &xmm_dst_hi, |
michael@0 | 4269 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 4270 | |
michael@0 | 4271 | save_128_aligned ( |
michael@0 | 4272 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 4273 | |
michael@0 | 4274 | src += 16; |
michael@0 | 4275 | dst += 16; |
michael@0 | 4276 | w -= 16; |
michael@0 | 4277 | } |
michael@0 | 4278 | |
michael@0 | 4279 | while (w) |
michael@0 | 4280 | { |
michael@0 | 4281 | s = (uint32_t) *src++; |
michael@0 | 4282 | d = (uint32_t) *dst; |
michael@0 | 4283 | |
michael@0 | 4284 | *dst++ = (uint8_t) pack_1x128_32 ( |
michael@0 | 4285 | pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d))); |
michael@0 | 4286 | w--; |
michael@0 | 4287 | } |
michael@0 | 4288 | } |
michael@0 | 4289 | |
michael@0 | 4290 | } |
michael@0 | 4291 | |
michael@0 | 4292 | static void |
michael@0 | 4293 | sse2_composite_add_n_8_8 (pixman_implementation_t *imp, |
michael@0 | 4294 | pixman_composite_info_t *info) |
michael@0 | 4295 | { |
michael@0 | 4296 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 4297 | uint8_t *dst_line, *dst; |
michael@0 | 4298 | uint8_t *mask_line, *mask; |
michael@0 | 4299 | int dst_stride, mask_stride; |
michael@0 | 4300 | int32_t w; |
michael@0 | 4301 | uint32_t src; |
michael@0 | 4302 | uint32_t m, d; |
michael@0 | 4303 | |
michael@0 | 4304 | __m128i xmm_alpha; |
michael@0 | 4305 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
michael@0 | 4306 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
michael@0 | 4307 | |
michael@0 | 4308 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4309 | dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
michael@0 | 4310 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4311 | mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
michael@0 | 4312 | |
michael@0 | 4313 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
michael@0 | 4314 | |
michael@0 | 4315 | xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src)); |
michael@0 | 4316 | |
michael@0 | 4317 | while (height--) |
michael@0 | 4318 | { |
michael@0 | 4319 | dst = dst_line; |
michael@0 | 4320 | dst_line += dst_stride; |
michael@0 | 4321 | mask = mask_line; |
michael@0 | 4322 | mask_line += mask_stride; |
michael@0 | 4323 | w = width; |
michael@0 | 4324 | |
michael@0 | 4325 | while (w && ((uintptr_t)dst & 15)) |
michael@0 | 4326 | { |
michael@0 | 4327 | m = (uint32_t) *mask++; |
michael@0 | 4328 | d = (uint32_t) *dst; |
michael@0 | 4329 | |
michael@0 | 4330 | *dst++ = (uint8_t) pack_1x128_32 ( |
michael@0 | 4331 | _mm_adds_epu16 ( |
michael@0 | 4332 | pix_multiply_1x128 ( |
michael@0 | 4333 | xmm_alpha, unpack_32_1x128 (m)), |
michael@0 | 4334 | unpack_32_1x128 (d))); |
michael@0 | 4335 | w--; |
michael@0 | 4336 | } |
michael@0 | 4337 | |
michael@0 | 4338 | while (w >= 16) |
michael@0 | 4339 | { |
michael@0 | 4340 | xmm_mask = load_128_unaligned ((__m128i*)mask); |
michael@0 | 4341 | xmm_dst = load_128_aligned ((__m128i*)dst); |
michael@0 | 4342 | |
michael@0 | 4343 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 4344 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 4345 | |
michael@0 | 4346 | pix_multiply_2x128 (&xmm_alpha, &xmm_alpha, |
michael@0 | 4347 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 4348 | &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 4349 | |
michael@0 | 4350 | xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); |
michael@0 | 4351 | xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); |
michael@0 | 4352 | |
michael@0 | 4353 | save_128_aligned ( |
michael@0 | 4354 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 4355 | |
michael@0 | 4356 | mask += 16; |
michael@0 | 4357 | dst += 16; |
michael@0 | 4358 | w -= 16; |
michael@0 | 4359 | } |
michael@0 | 4360 | |
michael@0 | 4361 | while (w) |
michael@0 | 4362 | { |
michael@0 | 4363 | m = (uint32_t) *mask++; |
michael@0 | 4364 | d = (uint32_t) *dst; |
michael@0 | 4365 | |
michael@0 | 4366 | *dst++ = (uint8_t) pack_1x128_32 ( |
michael@0 | 4367 | _mm_adds_epu16 ( |
michael@0 | 4368 | pix_multiply_1x128 ( |
michael@0 | 4369 | xmm_alpha, unpack_32_1x128 (m)), |
michael@0 | 4370 | unpack_32_1x128 (d))); |
michael@0 | 4371 | |
michael@0 | 4372 | w--; |
michael@0 | 4373 | } |
michael@0 | 4374 | } |
michael@0 | 4375 | |
michael@0 | 4376 | } |
michael@0 | 4377 | |
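michael@0 |  | /* ADD of a solid color onto an a8 destination.  Only the alpha byte of |
michael@0 |  | * the source matters here: 0x00 is a no-op and 0xff short-circuits to |
michael@0 |  | * pixman_fill (); anything in between is replicated into all 16 byte |
michael@0 |  | * lanes and saturate-added with _mm_adds_epu8. |
michael@0 |  | */ |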
michael@0 | 4378 | static void |
michael@0 | 4379 | sse2_composite_add_n_8 (pixman_implementation_t *imp, |
michael@0 | 4380 | pixman_composite_info_t *info) |
michael@0 | 4381 | { |
michael@0 | 4382 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 4383 | uint8_t *dst_line, *dst; |
michael@0 | 4384 | int dst_stride; |
michael@0 | 4385 | int32_t w; |
michael@0 | 4386 | uint32_t src; |
michael@0 | 4387 | |
michael@0 | 4388 | __m128i xmm_src; |
michael@0 | 4389 | |
michael@0 | 4390 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4391 | dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
michael@0 | 4392 | |
michael@0 | 4393 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
michael@0 | 4394 | |
michael@0 | 4395 | src >>= 24; |
michael@0 | 4396 | |
michael@0 | 4397 | if (src == 0x00) |
michael@0 | 4398 | return; |
michael@0 | 4399 | |
michael@0 | 4400 | if (src == 0xff) |
michael@0 | 4401 | { |
michael@0 | 4402 | pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, |
michael@0 | 4403 | 8, dest_x, dest_y, width, height, 0xff); |
michael@0 | 4404 | |
michael@0 | 4405 | return; |
michael@0 | 4406 | } |
michael@0 | 4407 | |
michael@0 | 4408 | src = (src << 24) | (src << 16) | (src << 8) | src; |
michael@0 | 4409 | xmm_src = _mm_set_epi32 (src, src, src, src); |
michael@0 | 4410 | |
michael@0 | 4411 | while (height--) |
michael@0 | 4412 | { |
michael@0 | 4413 | dst = dst_line; |
michael@0 | 4414 | dst_line += dst_stride; |
michael@0 | 4415 | w = width; |
michael@0 | 4416 | |
michael@0 | 4417 | while (w && ((uintptr_t)dst & 15)) |
michael@0 | 4418 | { |
michael@0 | 4419 | *dst = (uint8_t)_mm_cvtsi128_si32 ( |
michael@0 | 4420 | _mm_adds_epu8 ( |
michael@0 | 4421 | xmm_src, |
michael@0 | 4422 | _mm_cvtsi32_si128 (*dst))); |
michael@0 | 4423 | |
michael@0 | 4424 | w--; |
michael@0 | 4425 | dst++; |
michael@0 | 4426 | } |
michael@0 | 4427 | |
michael@0 | 4428 | while (w >= 16) |
michael@0 | 4429 | { |
michael@0 | 4430 | save_128_aligned ( |
michael@0 | 4431 | (__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); |
michael@0 | 4432 | |
michael@0 | 4433 | dst += 16; |
michael@0 | 4434 | w -= 16; |
michael@0 | 4435 | } |
michael@0 | 4436 | |
michael@0 | 4437 | while (w) |
michael@0 | 4438 | { |
michael@0 | 4439 | *dst = (uint8_t)_mm_cvtsi128_si32 ( |
michael@0 | 4440 | _mm_adds_epu8 ( |
michael@0 | 4441 | xmm_src, |
michael@0 | 4442 | _mm_cvtsi32_si128 (*dst))); |
michael@0 | 4443 | |
michael@0 | 4444 | w--; |
michael@0 | 4445 | dst++; |
michael@0 | 4446 | } |
michael@0 | 4447 | } |
michael@0 | 4448 | |
michael@0 | 4449 | } |
michael@0 | 4450 | |
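michael@0 |  | /* The scalar head and tail below use a branchless saturating add: t is |
michael@0 |  | * at most 0x1fe, so (t >> 8) is 1 exactly when the byte sum overflowed, |
michael@0 |  | * and (0 - (t >> 8)) is then an all-ones mask that clamps the result to |
michael@0 |  | * 0xff.  E.g. 200 + 100 = 0x12c, and (0x12c | ~0) stores as 0xff. |
michael@0 |  | * The 4-byte-aligned middle run is handed to sse2_combine_add_u as |
michael@0 |  | * w >> 2 dwords. |
michael@0 |  | */ |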
michael@0 | 4451 | static void |
michael@0 | 4452 | sse2_composite_add_8_8 (pixman_implementation_t *imp, |
michael@0 | 4453 | pixman_composite_info_t *info) |
michael@0 | 4454 | { |
michael@0 | 4455 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 4456 | uint8_t *dst_line, *dst; |
michael@0 | 4457 | uint8_t *src_line, *src; |
michael@0 | 4458 | int dst_stride, src_stride; |
michael@0 | 4459 | int32_t w; |
michael@0 | 4460 | uint16_t t; |
michael@0 | 4461 | |
michael@0 | 4462 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4463 | src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); |
michael@0 | 4464 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4465 | dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
michael@0 | 4466 | |
michael@0 | 4467 | while (height--) |
michael@0 | 4468 | { |
michael@0 | 4469 | dst = dst_line; |
michael@0 | 4470 | src = src_line; |
michael@0 | 4471 | |
michael@0 | 4472 | dst_line += dst_stride; |
michael@0 | 4473 | src_line += src_stride; |
michael@0 | 4474 | w = width; |
michael@0 | 4475 | |
michael@0 | 4476 | /* Small head */ |
michael@0 | 4477 | while (w && (uintptr_t)dst & 3) |
michael@0 | 4478 | { |
michael@0 | 4479 | t = (*dst) + (*src++); |
michael@0 | 4480 | *dst++ = t | (0 - (t >> 8)); |
michael@0 | 4481 | w--; |
michael@0 | 4482 | } |
michael@0 | 4483 | |
michael@0 | 4484 | sse2_combine_add_u (imp, op, |
michael@0 | 4485 | (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2); |
michael@0 | 4486 | |
michael@0 | 4487 | /* Small tail */ |
michael@0 | 4488 | dst += w & 0xfffc; |
michael@0 | 4489 | src += w & 0xfffc; |
michael@0 | 4490 | |
michael@0 | 4491 | w &= 3; |
michael@0 | 4492 | |
michael@0 | 4493 | while (w) |
michael@0 | 4494 | { |
michael@0 | 4495 | t = (*dst) + (*src++); |
michael@0 | 4496 | *dst++ = t | (0 - (t >> 8)); |
michael@0 | 4497 | w--; |
michael@0 | 4498 | } |
michael@0 | 4499 | } |
michael@0 | 4500 | |
michael@0 | 4501 | } |
michael@0 | 4502 | |
michael@0 | 4503 | static void |
michael@0 | 4504 | sse2_composite_add_8888_8888 (pixman_implementation_t *imp, |
michael@0 | 4505 | pixman_composite_info_t *info) |
michael@0 | 4506 | { |
michael@0 | 4507 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 4508 | uint32_t *dst_line, *dst; |
michael@0 | 4509 | uint32_t *src_line, *src; |
michael@0 | 4510 | int dst_stride, src_stride; |
michael@0 | 4511 | |
michael@0 | 4512 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4513 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
michael@0 | 4514 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4515 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
michael@0 | 4516 | |
michael@0 | 4517 | while (height--) |
michael@0 | 4518 | { |
michael@0 | 4519 | dst = dst_line; |
michael@0 | 4520 | dst_line += dst_stride; |
michael@0 | 4521 | src = src_line; |
michael@0 | 4522 | src_line += src_stride; |
michael@0 | 4523 | |
michael@0 | 4524 | sse2_combine_add_u (imp, op, dst, src, NULL, width); |
michael@0 | 4525 | } |
michael@0 | 4526 | } |
michael@0 | 4527 | |
michael@0 | 4528 | static void |
michael@0 | 4529 | sse2_composite_add_n_8888 (pixman_implementation_t *imp, |
michael@0 | 4530 | pixman_composite_info_t *info) |
michael@0 | 4531 | { |
michael@0 | 4532 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 4533 | uint32_t *dst_line, *dst, src; |
michael@0 | 4534 | int dst_stride; |
michael@0 | 4535 | |
michael@0 | 4536 | __m128i xmm_src; |
michael@0 | 4537 | |
michael@0 | 4538 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
michael@0 | 4539 | |
michael@0 | 4540 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
michael@0 | 4541 | if (src == 0) |
michael@0 | 4542 | return; |
michael@0 | 4543 | |
michael@0 | 4544 | if (src == ~0) |
michael@0 | 4545 | { |
michael@0 | 4546 | pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32, |
michael@0 | 4547 | dest_x, dest_y, width, height, ~0); |
michael@0 | 4548 | |
michael@0 | 4549 | return; |
michael@0 | 4550 | } |
michael@0 | 4551 | |
michael@0 | 4552 | xmm_src = _mm_set_epi32 (src, src, src, src); |
michael@0 | 4553 | while (height--) |
michael@0 | 4554 | { |
michael@0 | 4555 | int w = width; |
michael@0 | 4556 | uint32_t d; |
michael@0 | 4557 | |
michael@0 | 4558 | dst = dst_line; |
michael@0 | 4559 | dst_line += dst_stride; |
michael@0 | 4560 | |
michael@0 | 4561 | while (w && ((uintptr_t)dst & 15)) |
michael@0 | 4562 | { |
michael@0 | 4563 | d = *dst; |
michael@0 | 4564 | *dst++ = |
michael@0 | 4565 | _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d))); |
michael@0 | 4566 | w--; |
michael@0 | 4567 | } |
michael@0 | 4568 | |
michael@0 | 4569 | while (w >= 4) |
michael@0 | 4570 | { |
michael@0 | 4571 | save_128_aligned |
michael@0 | 4572 | ((__m128i*)dst, |
michael@0 | 4573 | _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst))); |
michael@0 | 4574 | |
michael@0 | 4575 | dst += 4; |
michael@0 | 4576 | w -= 4; |
michael@0 | 4577 | } |
michael@0 | 4578 | |
michael@0 | 4579 | while (w--) |
michael@0 | 4580 | { |
michael@0 | 4581 | d = *dst; |
michael@0 | 4582 | *dst++ = |
michael@0 | 4583 | _mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src, |
michael@0 | 4584 | _mm_cvtsi32_si128 (d))); |
michael@0 | 4585 | } |
michael@0 | 4586 | } |
michael@0 | 4587 | } |
michael@0 | 4588 | |
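michael@0 |  | /* ADD of a solid color through an a8 mask onto a8r8g8b8.  Four mask |
michael@0 |  | * bytes are fetched at once, expand_alpha_rev_2x128 replicates each one |
michael@0 |  | * across its pixel's four channels, the solid source is multiplied by |
michael@0 |  | * that expanded mask, and the result is saturate-added into dest. |
michael@0 |  | */ |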
michael@0 | 4589 | static void |
michael@0 | 4590 | sse2_composite_add_n_8_8888 (pixman_implementation_t *imp, |
michael@0 | 4591 | pixman_composite_info_t *info) |
michael@0 | 4592 | { |
michael@0 | 4593 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 4594 | uint32_t *dst_line, *dst; |
michael@0 | 4595 | uint8_t *mask_line, *mask; |
michael@0 | 4596 | int dst_stride, mask_stride; |
michael@0 | 4597 | int32_t w; |
michael@0 | 4598 | uint32_t src; |
michael@0 | 4599 | |
michael@0 | 4600 | __m128i xmm_src; |
michael@0 | 4601 | |
michael@0 | 4602 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
michael@0 | 4603 | if (src == 0) |
michael@0 | 4604 | return; |
michael@0 | 4605 | xmm_src = expand_pixel_32_1x128 (src); |
michael@0 | 4606 | |
michael@0 | 4607 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4608 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
michael@0 | 4609 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4610 | mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
michael@0 | 4611 | |
michael@0 | 4612 | while (height--) |
michael@0 | 4613 | { |
michael@0 | 4614 | dst = dst_line; |
michael@0 | 4615 | dst_line += dst_stride; |
michael@0 | 4616 | mask = mask_line; |
michael@0 | 4617 | mask_line += mask_stride; |
michael@0 | 4618 | w = width; |
michael@0 | 4619 | |
michael@0 | 4620 | while (w && ((uintptr_t)dst & 15)) |
michael@0 | 4621 | { |
michael@0 | 4622 | uint8_t m = *mask++; |
michael@0 | 4623 | if (m) |
michael@0 | 4624 | { |
michael@0 | 4625 | *dst = pack_1x128_32 |
michael@0 | 4626 | (_mm_adds_epu16 |
michael@0 | 4627 | (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)), |
michael@0 | 4628 | unpack_32_1x128 (*dst))); |
michael@0 | 4629 | } |
michael@0 | 4630 | dst++; |
michael@0 | 4631 | w--; |
michael@0 | 4632 | } |
michael@0 | 4633 | |
michael@0 | 4634 | while (w >= 4) |
michael@0 | 4635 | { |
michael@0 | 4636 | uint32_t m = *(uint32_t*)mask; |
michael@0 | 4637 | if (m) |
michael@0 | 4638 | { |
michael@0 | 4639 | __m128i xmm_mask_lo, xmm_mask_hi; |
michael@0 | 4640 | __m128i xmm_dst_lo, xmm_dst_hi; |
michael@0 | 4641 | |
michael@0 | 4642 | __m128i xmm_dst = load_128_aligned ((__m128i*)dst); |
michael@0 | 4643 | __m128i xmm_mask = |
michael@0 | 4644 | _mm_unpacklo_epi8 (unpack_32_1x128(m), |
michael@0 | 4645 | _mm_setzero_si128 ()); |
michael@0 | 4646 | |
michael@0 | 4647 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 4648 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 4649 | |
michael@0 | 4650 | expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, |
michael@0 | 4651 | &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 4652 | |
michael@0 | 4653 | pix_multiply_2x128 (&xmm_src, &xmm_src, |
michael@0 | 4654 | &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 4655 | &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 4656 | |
michael@0 | 4657 | xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo); |
michael@0 | 4658 | xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi); |
michael@0 | 4659 | |
michael@0 | 4660 | save_128_aligned ( |
michael@0 | 4661 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 4662 | } |
michael@0 | 4663 | |
michael@0 | 4664 | w -= 4; |
michael@0 | 4665 | dst += 4; |
michael@0 | 4666 | mask += 4; |
michael@0 | 4667 | } |
michael@0 | 4668 | |
michael@0 | 4669 | while (w) |
michael@0 | 4670 | { |
michael@0 | 4671 | uint8_t m = *mask++; |
michael@0 | 4672 | if (m) |
michael@0 | 4673 | { |
michael@0 | 4674 | *dst = pack_1x128_32 |
michael@0 | 4675 | (_mm_adds_epu16 |
michael@0 | 4676 | (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)), |
michael@0 | 4677 | unpack_32_1x128 (*dst))); |
michael@0 | 4678 | } |
michael@0 | 4679 | dst++; |
michael@0 | 4680 | w--; |
michael@0 | 4681 | } |
michael@0 | 4682 | } |
michael@0 | 4683 | } |
michael@0 | 4684 | |
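michael@0 |  | /* Same-bpp blit (16 or 32 bpp only).  Strides arrive in uint32_t units |
michael@0 |  | * and are rescaled to element units, then the copy is done bytewise: |
michael@0 |  | * 2- and 4-byte steps raise the destination to 16-byte alignment, a |
michael@0 |  | * 64-byte unrolled loop moves the bulk, and progressively smaller steps |
michael@0 |  | * mop up the tail.  Only the destination is kept aligned; the source is |
michael@0 |  | * read with load_128_unaligned. |
michael@0 |  | */ |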
michael@0 | 4685 | static pixman_bool_t |
michael@0 | 4686 | sse2_blt (pixman_implementation_t *imp, |
michael@0 | 4687 | uint32_t * src_bits, |
michael@0 | 4688 | uint32_t * dst_bits, |
michael@0 | 4689 | int src_stride, |
michael@0 | 4690 | int dst_stride, |
michael@0 | 4691 | int src_bpp, |
michael@0 | 4692 | int dst_bpp, |
michael@0 | 4693 | int src_x, |
michael@0 | 4694 | int src_y, |
michael@0 | 4695 | int dest_x, |
michael@0 | 4696 | int dest_y, |
michael@0 | 4697 | int width, |
michael@0 | 4698 | int height) |
michael@0 | 4699 | { |
michael@0 | 4700 | uint8_t * src_bytes; |
michael@0 | 4701 | uint8_t * dst_bytes; |
michael@0 | 4702 | int byte_width; |
michael@0 | 4703 | |
michael@0 | 4704 | if (src_bpp != dst_bpp) |
michael@0 | 4705 | return FALSE; |
michael@0 | 4706 | |
michael@0 | 4707 | if (src_bpp == 16) |
michael@0 | 4708 | { |
michael@0 | 4709 | src_stride = src_stride * (int) sizeof (uint32_t) / 2; |
michael@0 | 4710 | dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; |
michael@0 | 4711 | src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); |
michael@0 | 4712 | dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); |
michael@0 | 4713 | byte_width = 2 * width; |
michael@0 | 4714 | src_stride *= 2; |
michael@0 | 4715 | dst_stride *= 2; |
michael@0 | 4716 | } |
michael@0 | 4717 | else if (src_bpp == 32) |
michael@0 | 4718 | { |
michael@0 | 4719 | src_stride = src_stride * (int) sizeof (uint32_t) / 4; |
michael@0 | 4720 | dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; |
michael@0 | 4721 | src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); |
michael@0 | 4722 | dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); |
michael@0 | 4723 | byte_width = 4 * width; |
michael@0 | 4724 | src_stride *= 4; |
michael@0 | 4725 | dst_stride *= 4; |
michael@0 | 4726 | } |
michael@0 | 4727 | else |
michael@0 | 4728 | { |
michael@0 | 4729 | return FALSE; |
michael@0 | 4730 | } |
michael@0 | 4731 | |
michael@0 | 4732 | while (height--) |
michael@0 | 4733 | { |
michael@0 | 4734 | int w; |
michael@0 | 4735 | uint8_t *s = src_bytes; |
michael@0 | 4736 | uint8_t *d = dst_bytes; |
michael@0 | 4737 | src_bytes += src_stride; |
michael@0 | 4738 | dst_bytes += dst_stride; |
michael@0 | 4739 | w = byte_width; |
michael@0 | 4740 | |
michael@0 | 4741 | while (w >= 2 && ((uintptr_t)d & 3)) |
michael@0 | 4742 | { |
michael@0 | 4743 | *(uint16_t *)d = *(uint16_t *)s; |
michael@0 | 4744 | w -= 2; |
michael@0 | 4745 | s += 2; |
michael@0 | 4746 | d += 2; |
michael@0 | 4747 | } |
michael@0 | 4748 | |
michael@0 | 4749 | while (w >= 4 && ((uintptr_t)d & 15)) |
michael@0 | 4750 | { |
michael@0 | 4751 | *(uint32_t *)d = *(uint32_t *)s; |
michael@0 | 4752 | |
michael@0 | 4753 | w -= 4; |
michael@0 | 4754 | s += 4; |
michael@0 | 4755 | d += 4; |
michael@0 | 4756 | } |
michael@0 | 4757 | |
michael@0 | 4758 | while (w >= 64) |
michael@0 | 4759 | { |
michael@0 | 4760 | __m128i xmm0, xmm1, xmm2, xmm3; |
michael@0 | 4761 | |
michael@0 | 4762 | xmm0 = load_128_unaligned ((__m128i*)(s)); |
michael@0 | 4763 | xmm1 = load_128_unaligned ((__m128i*)(s + 16)); |
michael@0 | 4764 | xmm2 = load_128_unaligned ((__m128i*)(s + 32)); |
michael@0 | 4765 | xmm3 = load_128_unaligned ((__m128i*)(s + 48)); |
michael@0 | 4766 | |
michael@0 | 4767 | save_128_aligned ((__m128i*)(d), xmm0); |
michael@0 | 4768 | save_128_aligned ((__m128i*)(d + 16), xmm1); |
michael@0 | 4769 | save_128_aligned ((__m128i*)(d + 32), xmm2); |
michael@0 | 4770 | save_128_aligned ((__m128i*)(d + 48), xmm3); |
michael@0 | 4771 | |
michael@0 | 4772 | s += 64; |
michael@0 | 4773 | d += 64; |
michael@0 | 4774 | w -= 64; |
michael@0 | 4775 | } |
michael@0 | 4776 | |
michael@0 | 4777 | while (w >= 16) |
michael@0 | 4778 | { |
michael@0 | 4779 | save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s)); |
michael@0 | 4780 | |
michael@0 | 4781 | w -= 16; |
michael@0 | 4782 | d += 16; |
michael@0 | 4783 | s += 16; |
michael@0 | 4784 | } |
michael@0 | 4785 | |
michael@0 | 4786 | while (w >= 4) |
michael@0 | 4787 | { |
michael@0 | 4788 | *(uint32_t *)d = *(uint32_t *)s; |
michael@0 | 4789 | |
michael@0 | 4790 | w -= 4; |
michael@0 | 4791 | s += 4; |
michael@0 | 4792 | d += 4; |
michael@0 | 4793 | } |
michael@0 | 4794 | |
michael@0 | 4795 | if (w >= 2) |
michael@0 | 4796 | { |
michael@0 | 4797 | *(uint16_t *)d = *(uint16_t *)s; |
michael@0 | 4798 | w -= 2; |
michael@0 | 4799 | s += 2; |
michael@0 | 4800 | d += 2; |
michael@0 | 4801 | } |
michael@0 | 4802 | } |
michael@0 | 4803 | |
michael@0 | 4804 | return TRUE; |
michael@0 | 4805 | } |
michael@0 | 4806 | |
michael@0 | 4807 | static void |
michael@0 | 4808 | sse2_composite_copy_area (pixman_implementation_t *imp, |
michael@0 | 4809 | pixman_composite_info_t *info) |
michael@0 | 4810 | { |
michael@0 | 4811 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 4812 | sse2_blt (imp, src_image->bits.bits, |
michael@0 | 4813 | dest_image->bits.bits, |
michael@0 | 4814 | src_image->bits.rowstride, |
michael@0 | 4815 | dest_image->bits.rowstride, |
michael@0 | 4816 | PIXMAN_FORMAT_BPP (src_image->bits.format), |
michael@0 | 4817 | PIXMAN_FORMAT_BPP (dest_image->bits.format), |
michael@0 | 4818 | src_x, src_y, dest_x, dest_y, width, height); |
michael@0 | 4819 | } |
michael@0 | 4820 | |
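michael@0 |  | /* OVER of x8r8g8b8 through an a8 mask.  The source has no alpha, so it |
michael@0 |  | * is forced opaque by ORing in 0xff000000; with unit source alpha the |
michael@0 |  | * general in_over collapses to using mask_00ff as the alpha vector, and |
michael@0 |  | * a fully set mask word (0xffffffff) lets four pixels be stored |
michael@0 |  | * directly. |
michael@0 |  | */ |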
michael@0 | 4821 | static void |
michael@0 | 4822 | sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp, |
michael@0 | 4823 | pixman_composite_info_t *info) |
michael@0 | 4824 | { |
michael@0 | 4825 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 4826 | uint32_t *src, *src_line, s; |
michael@0 | 4827 | uint32_t *dst, *dst_line, d; |
michael@0 | 4828 | uint8_t *mask, *mask_line; |
michael@0 | 4829 | uint32_t m; |
michael@0 | 4830 | int src_stride, mask_stride, dst_stride; |
michael@0 | 4831 | int32_t w; |
michael@0 | 4832 | __m128i ms; |
michael@0 | 4833 | |
michael@0 | 4834 | __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
michael@0 | 4835 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
michael@0 | 4836 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
michael@0 | 4837 | |
michael@0 | 4838 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4839 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
michael@0 | 4840 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4841 | mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
michael@0 | 4842 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4843 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
michael@0 | 4844 | |
michael@0 | 4845 | while (height--) |
michael@0 | 4846 | { |
michael@0 | 4847 | src = src_line; |
michael@0 | 4848 | src_line += src_stride; |
michael@0 | 4849 | dst = dst_line; |
michael@0 | 4850 | dst_line += dst_stride; |
michael@0 | 4851 | mask = mask_line; |
michael@0 | 4852 | mask_line += mask_stride; |
michael@0 | 4853 | |
michael@0 | 4854 | w = width; |
michael@0 | 4855 | |
michael@0 | 4856 | while (w && (uintptr_t)dst & 15) |
michael@0 | 4857 | { |
michael@0 | 4858 | s = 0xff000000 | *src++; |
michael@0 | 4859 | m = (uint32_t) *mask++; |
michael@0 | 4860 | d = *dst; |
michael@0 | 4861 | ms = unpack_32_1x128 (s); |
michael@0 | 4862 | |
michael@0 | 4863 | if (m != 0xff) |
michael@0 | 4864 | { |
michael@0 | 4865 | __m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); |
michael@0 | 4866 | __m128i md = unpack_32_1x128 (d); |
michael@0 | 4867 | |
michael@0 | 4868 | ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md); |
michael@0 | 4869 | } |
michael@0 | 4870 | |
michael@0 | 4871 | *dst++ = pack_1x128_32 (ms); |
michael@0 | 4872 | w--; |
michael@0 | 4873 | } |
michael@0 | 4874 | |
michael@0 | 4875 | while (w >= 4) |
michael@0 | 4876 | { |
michael@0 | 4877 | m = *(uint32_t*) mask; |
michael@0 | 4878 | xmm_src = _mm_or_si128 ( |
michael@0 | 4879 | load_128_unaligned ((__m128i*)src), mask_ff000000); |
michael@0 | 4880 | |
michael@0 | 4881 | if (m == 0xffffffff) |
michael@0 | 4882 | { |
michael@0 | 4883 | save_128_aligned ((__m128i*)dst, xmm_src); |
michael@0 | 4884 | } |
michael@0 | 4885 | else |
michael@0 | 4886 | { |
michael@0 | 4887 | xmm_dst = load_128_aligned ((__m128i*)dst); |
michael@0 | 4888 | |
michael@0 | 4889 | xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); |
michael@0 | 4890 | |
michael@0 | 4891 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 4892 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 4893 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 4894 | |
michael@0 | 4895 | expand_alpha_rev_2x128 ( |
michael@0 | 4896 | xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 4897 | |
michael@0 | 4898 | in_over_2x128 (&xmm_src_lo, &xmm_src_hi, |
michael@0 | 4899 | &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi, |
michael@0 | 4900 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 4901 | |
michael@0 | 4902 | save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 4903 | } |
michael@0 | 4904 | |
michael@0 | 4905 | src += 4; |
michael@0 | 4906 | dst += 4; |
michael@0 | 4907 | mask += 4; |
michael@0 | 4908 | w -= 4; |
michael@0 | 4909 | } |
michael@0 | 4910 | |
michael@0 | 4911 | while (w) |
michael@0 | 4912 | { |
michael@0 | 4913 | m = (uint32_t) *mask++; |
michael@0 | 4914 | |
michael@0 | 4915 | if (m) |
michael@0 | 4916 | { |
michael@0 | 4917 | s = 0xff000000 | *src; |
michael@0 | 4918 | |
michael@0 | 4919 | if (m == 0xff) |
michael@0 | 4920 | { |
michael@0 | 4921 | *dst = s; |
michael@0 | 4922 | } |
michael@0 | 4923 | else |
michael@0 | 4924 | { |
michael@0 | 4925 | __m128i ma, md, ms; |
michael@0 | 4926 | |
michael@0 | 4927 | d = *dst; |
michael@0 | 4928 | |
michael@0 | 4929 | ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m)); |
michael@0 | 4930 | md = unpack_32_1x128 (d); |
michael@0 | 4931 | ms = unpack_32_1x128 (s); |
michael@0 | 4932 | |
michael@0 | 4933 | *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md)); |
michael@0 | 4934 | } |
michael@0 | 4935 | |
michael@0 | 4936 | } |
michael@0 | 4937 | |
michael@0 | 4938 | src++; |
michael@0 | 4939 | dst++; |
michael@0 | 4940 | w--; |
michael@0 | 4941 | } |
michael@0 | 4942 | } |
michael@0 | 4943 | |
michael@0 | 4944 | } |
michael@0 | 4945 | |
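michael@0 |  | /* OVER of a8r8g8b8 through an a8 mask: dest = (src IN mask) OVER dest. |
michael@0 |  | * The fast cases are a zero mask (nothing to do) and a 0xff mask over an |
michael@0 |  | * opaque source (plain copy); everything else goes through in_over with |
michael@0 |  | * the source alpha taken from bits 24-31. |
michael@0 |  | */ |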
michael@0 | 4946 | static void |
michael@0 | 4947 | sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp, |
michael@0 | 4948 | pixman_composite_info_t *info) |
michael@0 | 4949 | { |
michael@0 | 4950 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 4951 | uint32_t *src, *src_line, s; |
michael@0 | 4952 | uint32_t *dst, *dst_line, d; |
michael@0 | 4953 | uint8_t *mask, *mask_line; |
michael@0 | 4954 | uint32_t m; |
michael@0 | 4955 | int src_stride, mask_stride, dst_stride; |
michael@0 | 4956 | int32_t w; |
michael@0 | 4957 | |
michael@0 | 4958 | __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; |
michael@0 | 4959 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
michael@0 | 4960 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
michael@0 | 4961 | |
michael@0 | 4962 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4963 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
michael@0 | 4964 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4965 | mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
michael@0 | 4966 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 4967 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
michael@0 | 4968 | |
michael@0 | 4969 | while (height--) |
michael@0 | 4970 | { |
michael@0 | 4971 | src = src_line; |
michael@0 | 4972 | src_line += src_stride; |
michael@0 | 4973 | dst = dst_line; |
michael@0 | 4974 | dst_line += dst_stride; |
michael@0 | 4975 | mask = mask_line; |
michael@0 | 4976 | mask_line += mask_stride; |
michael@0 | 4977 | |
michael@0 | 4978 | w = width; |
michael@0 | 4979 | |
michael@0 | 4980 | while (w && (uintptr_t)dst & 15) |
michael@0 | 4981 | { |
michael@0 | 4982 | uint32_t sa; |
michael@0 | 4983 | |
michael@0 | 4984 | s = *src++; |
michael@0 | 4985 | m = (uint32_t) *mask++; |
michael@0 | 4986 | d = *dst; |
michael@0 | 4987 | |
michael@0 | 4988 | sa = s >> 24; |
michael@0 | 4989 | |
michael@0 | 4990 | if (m) |
michael@0 | 4991 | { |
michael@0 | 4992 | if (sa == 0xff && m == 0xff) |
michael@0 | 4993 | { |
michael@0 | 4994 | *dst = s; |
michael@0 | 4995 | } |
michael@0 | 4996 | else |
michael@0 | 4997 | { |
michael@0 | 4998 | __m128i ms, md, ma, msa; |
michael@0 | 4999 | |
michael@0 | 5000 | ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); |
michael@0 | 5001 | ms = unpack_32_1x128 (s); |
michael@0 | 5002 | md = unpack_32_1x128 (d); |
michael@0 | 5003 | |
michael@0 | 5004 | msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); |
michael@0 | 5005 | |
michael@0 | 5006 | *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); |
michael@0 | 5007 | } |
michael@0 | 5008 | } |
michael@0 | 5009 | |
michael@0 | 5010 | dst++; |
michael@0 | 5011 | w--; |
michael@0 | 5012 | } |
michael@0 | 5013 | |
michael@0 | 5014 | while (w >= 4) |
michael@0 | 5015 | { |
michael@0 | 5016 | m = *(uint32_t *) mask; |
michael@0 | 5017 | |
michael@0 | 5018 | if (m) |
michael@0 | 5019 | { |
michael@0 | 5020 | xmm_src = load_128_unaligned ((__m128i*)src); |
michael@0 | 5021 | |
michael@0 | 5022 | if (m == 0xffffffff && is_opaque (xmm_src)) |
michael@0 | 5023 | { |
michael@0 | 5024 | save_128_aligned ((__m128i *)dst, xmm_src); |
michael@0 | 5025 | } |
michael@0 | 5026 | else |
michael@0 | 5027 | { |
michael@0 | 5028 | xmm_dst = load_128_aligned ((__m128i *)dst); |
michael@0 | 5029 | |
michael@0 | 5030 | xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); |
michael@0 | 5031 | |
michael@0 | 5032 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 5033 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 5034 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 5035 | |
michael@0 | 5036 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); |
michael@0 | 5037 | expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 5038 | |
michael@0 | 5039 | in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, |
michael@0 | 5040 | &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 5041 | |
michael@0 | 5042 | save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 5043 | } |
michael@0 | 5044 | } |
michael@0 | 5045 | |
michael@0 | 5046 | src += 4; |
michael@0 | 5047 | dst += 4; |
michael@0 | 5048 | mask += 4; |
michael@0 | 5049 | w -= 4; |
michael@0 | 5050 | } |
michael@0 | 5051 | |
michael@0 | 5052 | while (w) |
michael@0 | 5053 | { |
michael@0 | 5054 | uint32_t sa; |
michael@0 | 5055 | |
michael@0 | 5056 | s = *src++; |
michael@0 | 5057 | m = (uint32_t) *mask++; |
michael@0 | 5058 | d = *dst; |
michael@0 | 5059 | |
michael@0 | 5060 | sa = s >> 24; |
michael@0 | 5061 | |
michael@0 | 5062 | if (m) |
michael@0 | 5063 | { |
michael@0 | 5064 | if (sa == 0xff && m == 0xff) |
michael@0 | 5065 | { |
michael@0 | 5066 | *dst = s; |
michael@0 | 5067 | } |
michael@0 | 5068 | else |
michael@0 | 5069 | { |
michael@0 | 5070 | __m128i ms, md, ma, msa; |
michael@0 | 5071 | |
michael@0 | 5072 | ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); |
michael@0 | 5073 | ms = unpack_32_1x128 (s); |
michael@0 | 5074 | md = unpack_32_1x128 (d); |
michael@0 | 5075 | |
michael@0 | 5076 | msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); |
michael@0 | 5077 | |
michael@0 | 5078 | *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); |
michael@0 | 5079 | } |
michael@0 | 5080 | } |
michael@0 | 5081 | |
michael@0 | 5082 | dst++; |
michael@0 | 5083 | w--; |
michael@0 | 5084 | } |
michael@0 | 5085 | } |
michael@0 | 5086 | |
michael@0 | 5087 | } |
michael@0 | 5088 | |
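michael@0 |  | /* OVER_REVERSE with a solid source: dest = dest OVER src.  The operands |
michael@0 |  | * are swapped, so the destination supplies the alpha (expand_alpha on |
michael@0 |  | * dst) and the expanded solid color is passed to over_2x128 as the |
michael@0 |  | * "destination" via tmp_lo/tmp_hi. |
michael@0 |  | */ |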
michael@0 | 5089 | static void |
michael@0 | 5090 | sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp, |
michael@0 | 5091 | pixman_composite_info_t *info) |
michael@0 | 5092 | { |
michael@0 | 5093 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 5094 | uint32_t src; |
michael@0 | 5095 | uint32_t *dst_line, *dst; |
michael@0 | 5096 | __m128i xmm_src; |
michael@0 | 5097 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
michael@0 | 5098 | __m128i xmm_dsta_hi, xmm_dsta_lo; |
michael@0 | 5099 | int dst_stride; |
michael@0 | 5100 | int32_t w; |
michael@0 | 5101 | |
michael@0 | 5102 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
michael@0 | 5103 | |
michael@0 | 5104 | if (src == 0) |
michael@0 | 5105 | return; |
michael@0 | 5106 | |
michael@0 | 5107 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 5108 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
michael@0 | 5109 | |
michael@0 | 5110 | xmm_src = expand_pixel_32_1x128 (src); |
michael@0 | 5111 | |
michael@0 | 5112 | while (height--) |
michael@0 | 5113 | { |
michael@0 | 5114 | dst = dst_line; |
michael@0 | 5115 | |
michael@0 | 5116 | dst_line += dst_stride; |
michael@0 | 5117 | w = width; |
michael@0 | 5118 | |
michael@0 | 5119 | while (w && (uintptr_t)dst & 15) |
michael@0 | 5120 | { |
michael@0 | 5121 | __m128i vd; |
michael@0 | 5122 | |
michael@0 | 5123 | vd = unpack_32_1x128 (*dst); |
michael@0 | 5124 | |
michael@0 | 5125 | *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), |
michael@0 | 5126 | xmm_src)); |
michael@0 | 5127 | w--; |
michael@0 | 5128 | dst++; |
michael@0 | 5129 | } |
michael@0 | 5130 | |
michael@0 | 5131 | while (w >= 4) |
michael@0 | 5132 | { |
michael@0 | 5133 | __m128i tmp_lo, tmp_hi; |
michael@0 | 5134 | |
michael@0 | 5135 | xmm_dst = load_128_aligned ((__m128i*)dst); |
michael@0 | 5136 | |
michael@0 | 5137 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 5138 | expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi); |
michael@0 | 5139 | |
michael@0 | 5140 | tmp_lo = xmm_src; |
michael@0 | 5141 | tmp_hi = xmm_src; |
michael@0 | 5142 | |
michael@0 | 5143 | over_2x128 (&xmm_dst_lo, &xmm_dst_hi, |
michael@0 | 5144 | &xmm_dsta_lo, &xmm_dsta_hi, |
michael@0 | 5145 | &tmp_lo, &tmp_hi); |
michael@0 | 5146 | |
michael@0 | 5147 | save_128_aligned ( |
michael@0 | 5148 | (__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi)); |
michael@0 | 5149 | |
michael@0 | 5150 | w -= 4; |
michael@0 | 5151 | dst += 4; |
michael@0 | 5152 | } |
michael@0 | 5153 | |
michael@0 | 5154 | while (w) |
michael@0 | 5155 | { |
michael@0 | 5156 | __m128i vd; |
michael@0 | 5157 | |
michael@0 | 5158 | vd = unpack_32_1x128 (*dst); |
michael@0 | 5159 | |
michael@0 | 5160 | *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd), |
michael@0 | 5161 | xmm_src)); |
michael@0 | 5162 | w--; |
michael@0 | 5163 | dst++; |
michael@0 | 5164 | } |
michael@0 | 5165 | |
michael@0 | 5166 | } |
michael@0 | 5167 | |
michael@0 | 5168 | } |
michael@0 | 5169 | |
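michael@0 |  | /* Like sse2_composite_over_8888_8_8888 above, but the mask is a8r8g8b8 |
michael@0 |  | * and only its alpha byte (m >> 24) is used.  The 4-pixel loop loads |
michael@0 |  | * the whole mask vector so is_transparent/is_opaque can skip or |
michael@0 |  | * fast-path a block without unpacking it. |
michael@0 |  | */ |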
michael@0 | 5170 | static void |
michael@0 | 5171 | sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp, |
michael@0 | 5172 | pixman_composite_info_t *info) |
michael@0 | 5173 | { |
michael@0 | 5174 | PIXMAN_COMPOSITE_ARGS (info); |
michael@0 | 5175 | uint32_t *src, *src_line, s; |
michael@0 | 5176 | uint32_t *dst, *dst_line, d; |
michael@0 | 5177 | uint32_t *mask, *mask_line; |
michael@0 | 5178 | uint32_t m; |
michael@0 | 5179 | int src_stride, mask_stride, dst_stride; |
michael@0 | 5180 | int32_t w; |
michael@0 | 5181 | |
michael@0 | 5182 | __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; |
michael@0 | 5183 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
michael@0 | 5184 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
michael@0 | 5185 | |
michael@0 | 5186 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 5187 | dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
michael@0 | 5188 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 5189 | mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
michael@0 | 5190 | PIXMAN_IMAGE_GET_LINE ( |
michael@0 | 5191 | src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
michael@0 | 5192 | |
michael@0 | 5193 | while (height--) |
michael@0 | 5194 | { |
michael@0 | 5195 | src = src_line; |
michael@0 | 5196 | src_line += src_stride; |
michael@0 | 5197 | dst = dst_line; |
michael@0 | 5198 | dst_line += dst_stride; |
michael@0 | 5199 | mask = mask_line; |
michael@0 | 5200 | mask_line += mask_stride; |
michael@0 | 5201 | |
michael@0 | 5202 | w = width; |
michael@0 | 5203 | |
michael@0 | 5204 | while (w && (uintptr_t)dst & 15) |
michael@0 | 5205 | { |
michael@0 | 5206 | uint32_t sa; |
michael@0 | 5207 | |
michael@0 | 5208 | s = *src++; |
michael@0 | 5209 | m = (*mask++) >> 24; |
michael@0 | 5210 | d = *dst; |
michael@0 | 5211 | |
michael@0 | 5212 | sa = s >> 24; |
michael@0 | 5213 | |
michael@0 | 5214 | if (m) |
michael@0 | 5215 | { |
michael@0 | 5216 | if (sa == 0xff && m == 0xff) |
michael@0 | 5217 | { |
michael@0 | 5218 | *dst = s; |
michael@0 | 5219 | } |
michael@0 | 5220 | else |
michael@0 | 5221 | { |
michael@0 | 5222 | __m128i ms, md, ma, msa; |
michael@0 | 5223 | |
michael@0 | 5224 | ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); |
michael@0 | 5225 | ms = unpack_32_1x128 (s); |
michael@0 | 5226 | md = unpack_32_1x128 (d); |
michael@0 | 5227 | |
michael@0 | 5228 | msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); |
michael@0 | 5229 | |
michael@0 | 5230 | *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); |
michael@0 | 5231 | } |
michael@0 | 5232 | } |
michael@0 | 5233 | |
michael@0 | 5234 | dst++; |
michael@0 | 5235 | w--; |
michael@0 | 5236 | } |
michael@0 | 5237 | |
michael@0 | 5238 | while (w >= 4) |
michael@0 | 5239 | { |
michael@0 | 5240 | xmm_mask = load_128_unaligned ((__m128i*)mask); |
michael@0 | 5241 | |
michael@0 | 5242 | if (!is_transparent (xmm_mask)) |
michael@0 | 5243 | { |
michael@0 | 5244 | xmm_src = load_128_unaligned ((__m128i*)src); |
michael@0 | 5245 | |
michael@0 | 5246 | if (is_opaque (xmm_mask) && is_opaque (xmm_src)) |
michael@0 | 5247 | { |
michael@0 | 5248 | save_128_aligned ((__m128i *)dst, xmm_src); |
michael@0 | 5249 | } |
michael@0 | 5250 | else |
michael@0 | 5251 | { |
michael@0 | 5252 | xmm_dst = load_128_aligned ((__m128i *)dst); |
michael@0 | 5253 | |
michael@0 | 5254 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 5255 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 5256 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 5257 | |
michael@0 | 5258 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); |
michael@0 | 5259 | expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 5260 | |
michael@0 | 5261 | in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, |
michael@0 | 5262 | &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 5263 | |
michael@0 | 5264 | save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 5265 | } |
michael@0 | 5266 | } |
michael@0 | 5267 | |
michael@0 | 5268 | src += 4; |
michael@0 | 5269 | dst += 4; |
michael@0 | 5270 | mask += 4; |
michael@0 | 5271 | w -= 4; |
michael@0 | 5272 | } |
michael@0 | 5273 | |
michael@0 | 5274 | while (w) |
michael@0 | 5275 | { |
michael@0 | 5276 | uint32_t sa; |
michael@0 | 5277 | |
michael@0 | 5278 | s = *src++; |
michael@0 | 5279 | m = (*mask++) >> 24; |
michael@0 | 5280 | d = *dst; |
michael@0 | 5281 | |
michael@0 | 5282 | sa = s >> 24; |
michael@0 | 5283 | |
michael@0 | 5284 | if (m) |
michael@0 | 5285 | { |
michael@0 | 5286 | if (sa == 0xff && m == 0xff) |
michael@0 | 5287 | { |
michael@0 | 5288 | *dst = s; |
michael@0 | 5289 | } |
michael@0 | 5290 | else |
michael@0 | 5291 | { |
michael@0 | 5292 | __m128i ms, md, ma, msa; |
michael@0 | 5293 | |
michael@0 | 5294 | ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); |
michael@0 | 5295 | ms = unpack_32_1x128 (s); |
michael@0 | 5296 | md = unpack_32_1x128 (d); |
michael@0 | 5297 | |
michael@0 | 5298 | msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); |
michael@0 | 5299 | |
michael@0 | 5300 | *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); |
michael@0 | 5301 | } |
michael@0 | 5302 | } |
michael@0 | 5303 | |
michael@0 | 5304 | dst++; |
michael@0 | 5305 | w--; |
michael@0 | 5306 | } |
michael@0 | 5307 | } |
michael@0 | 5308 | |
michael@0 | 5309 | } |
michael@0 | 5310 | |
michael@0 | 5311 | /* A variant of 'sse2_combine_over_u' with minor tweaks */ |
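michael@0 |  | /* Source x positions are tracked in 16.16 fixed point: each output |
michael@0 |  | * pixel reads ps[pixman_fixed_to_int (vx)] and then advances vx by |
michael@0 |  | * unit_x.  The "while (vx >= 0) vx -= src_width_fixed" loops wrap vx |
michael@0 |  | * back by whole source widths; the FAST_NEAREST_MAINLOOP |
michael@0 |  | * instantiations below bias vx and ps so that this indexing is correct |
michael@0 |  | * for each repeat mode. |
michael@0 |  | */ |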
michael@0 | 5312 | static force_inline void |
michael@0 | 5313 | scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t* pd, |
michael@0 | 5314 | const uint32_t* ps, |
michael@0 | 5315 | int32_t w, |
michael@0 | 5316 | pixman_fixed_t vx, |
michael@0 | 5317 | pixman_fixed_t unit_x, |
michael@0 | 5318 | pixman_fixed_t src_width_fixed, |
michael@0 | 5319 | pixman_bool_t fully_transparent_src) |
michael@0 | 5320 | { |
michael@0 | 5321 | uint32_t s, d; |
michael@0 | 5322 | const uint32_t* pm = NULL; |
michael@0 | 5323 | |
michael@0 | 5324 | __m128i xmm_dst_lo, xmm_dst_hi; |
michael@0 | 5325 | __m128i xmm_src_lo, xmm_src_hi; |
michael@0 | 5326 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
michael@0 | 5327 | |
michael@0 | 5328 | if (fully_transparent_src) |
michael@0 | 5329 | return; |
michael@0 | 5330 | |
michael@0 | 5331 | /* Align dst on a 16-byte boundary */ |
michael@0 | 5332 | while (w && ((uintptr_t)pd & 15)) |
michael@0 | 5333 | { |
michael@0 | 5334 | d = *pd; |
michael@0 | 5335 | s = combine1 (ps + pixman_fixed_to_int (vx), pm); |
michael@0 | 5336 | vx += unit_x; |
michael@0 | 5337 | while (vx >= 0) |
michael@0 | 5338 | vx -= src_width_fixed; |
michael@0 | 5339 | |
michael@0 | 5340 | *pd++ = core_combine_over_u_pixel_sse2 (s, d); |
michael@0 | 5341 | if (pm) |
michael@0 | 5342 | pm++; |
michael@0 | 5343 | w--; |
michael@0 | 5344 | } |
michael@0 | 5345 | |
michael@0 | 5346 | while (w >= 4) |
michael@0 | 5347 | { |
michael@0 | 5348 | __m128i tmp; |
michael@0 | 5349 | uint32_t tmp1, tmp2, tmp3, tmp4; |
michael@0 | 5350 | |
michael@0 | 5351 | tmp1 = *(ps + pixman_fixed_to_int (vx)); |
michael@0 | 5352 | vx += unit_x; |
michael@0 | 5353 | while (vx >= 0) |
michael@0 | 5354 | vx -= src_width_fixed; |
michael@0 | 5355 | tmp2 = *(ps + pixman_fixed_to_int (vx)); |
michael@0 | 5356 | vx += unit_x; |
michael@0 | 5357 | while (vx >= 0) |
michael@0 | 5358 | vx -= src_width_fixed; |
michael@0 | 5359 | tmp3 = *(ps + pixman_fixed_to_int (vx)); |
michael@0 | 5360 | vx += unit_x; |
michael@0 | 5361 | while (vx >= 0) |
michael@0 | 5362 | vx -= src_width_fixed; |
michael@0 | 5363 | tmp4 = *(ps + pixman_fixed_to_int (vx)); |
michael@0 | 5364 | vx += unit_x; |
michael@0 | 5365 | while (vx >= 0) |
michael@0 | 5366 | vx -= src_width_fixed; |
michael@0 | 5367 | |
michael@0 | 5368 | tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); |
michael@0 | 5369 | |
michael@0 | 5370 | xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm); |
michael@0 | 5371 | |
michael@0 | 5372 | if (is_opaque (xmm_src_hi)) |
michael@0 | 5373 | { |
michael@0 | 5374 | save_128_aligned ((__m128i*)pd, xmm_src_hi); |
michael@0 | 5375 | } |
michael@0 | 5376 | else if (!is_zero (xmm_src_hi)) |
michael@0 | 5377 | { |
michael@0 | 5378 | xmm_dst_hi = load_128_aligned ((__m128i*) pd); |
michael@0 | 5379 | |
michael@0 | 5380 | unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 5381 | unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 5382 | |
michael@0 | 5383 | expand_alpha_2x128 ( |
michael@0 | 5384 | xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); |
michael@0 | 5385 | |
michael@0 | 5386 | over_2x128 (&xmm_src_lo, &xmm_src_hi, |
michael@0 | 5387 | &xmm_alpha_lo, &xmm_alpha_hi, |
michael@0 | 5388 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 5389 | |
michael@0 | 5390 | /* rebuild the 4 pixel data and save */ |
michael@0 | 5391 | save_128_aligned ((__m128i*)pd, |
michael@0 | 5392 | pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 5393 | } |
michael@0 | 5394 | |
michael@0 | 5395 | w -= 4; |
michael@0 | 5396 | pd += 4; |
michael@0 | 5397 | if (pm) |
michael@0 | 5398 | pm += 4; |
michael@0 | 5399 | } |
michael@0 | 5400 | |
michael@0 | 5401 | while (w) |
michael@0 | 5402 | { |
michael@0 | 5403 | d = *pd; |
michael@0 | 5404 | s = combine1 (ps + pixman_fixed_to_int (vx), pm); |
michael@0 | 5405 | vx += unit_x; |
michael@0 | 5406 | while (vx >= 0) |
michael@0 | 5407 | vx -= src_width_fixed; |
michael@0 | 5408 | |
michael@0 | 5409 | *pd++ = core_combine_over_u_pixel_sse2 (s, d); |
michael@0 | 5410 | if (pm) |
michael@0 | 5411 | pm++; |
michael@0 | 5412 | |
michael@0 | 5413 | w--; |
michael@0 | 5414 | } |
michael@0 | 5415 | } |
michael@0 | 5416 | |
michael@0 | 5417 | FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER, |
michael@0 | 5418 | scaled_nearest_scanline_sse2_8888_8888_OVER, |
michael@0 | 5419 | uint32_t, uint32_t, COVER) |
michael@0 | 5420 | FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER, |
michael@0 | 5421 | scaled_nearest_scanline_sse2_8888_8888_OVER, |
michael@0 | 5422 | uint32_t, uint32_t, NONE) |
michael@0 | 5423 | FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER, |
michael@0 | 5424 | scaled_nearest_scanline_sse2_8888_8888_OVER, |
michael@0 | 5425 | uint32_t, uint32_t, PAD) |
michael@0 | 5426 | FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER, |
michael@0 | 5427 | scaled_nearest_scanline_sse2_8888_8888_OVER, |
michael@0 | 5428 | uint32_t, uint32_t, NORMAL) |
michael@0 | 5429 | |
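michael@0 |  | /* Nearest-scaled OVER with a solid (n) mask.  The mask is reduced to |
michael@0 |  | * its alpha byte and expanded once with create_mask_16_128; the |
michael@0 |  | * scanline is then the same fixed-point walk as above, using in_over |
michael@0 |  | * instead of a plain over. |
michael@0 |  | */ |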
michael@0 | 5430 | static force_inline void |
michael@0 | 5431 | scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask, |
michael@0 | 5432 | uint32_t * dst, |
michael@0 | 5433 | const uint32_t * src, |
michael@0 | 5434 | int32_t w, |
michael@0 | 5435 | pixman_fixed_t vx, |
michael@0 | 5436 | pixman_fixed_t unit_x, |
michael@0 | 5437 | pixman_fixed_t src_width_fixed, |
michael@0 | 5438 | pixman_bool_t zero_src) |
michael@0 | 5439 | { |
michael@0 | 5440 | __m128i xmm_mask; |
michael@0 | 5441 | __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
michael@0 | 5442 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
michael@0 | 5443 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
michael@0 | 5444 | |
michael@0 | 5445 | if (zero_src || (*mask >> 24) == 0) |
michael@0 | 5446 | return; |
michael@0 | 5447 | |
michael@0 | 5448 | xmm_mask = create_mask_16_128 (*mask >> 24); |
michael@0 | 5449 | |
michael@0 | 5450 | while (w && (uintptr_t)dst & 15) |
michael@0 | 5451 | { |
michael@0 | 5452 | uint32_t s = *(src + pixman_fixed_to_int (vx)); |
michael@0 | 5453 | vx += unit_x; |
michael@0 | 5454 | while (vx >= 0) |
michael@0 | 5455 | vx -= src_width_fixed; |
michael@0 | 5456 | |
michael@0 | 5457 | if (s) |
michael@0 | 5458 | { |
michael@0 | 5459 | uint32_t d = *dst; |
michael@0 | 5460 | |
michael@0 | 5461 | __m128i ms = unpack_32_1x128 (s); |
michael@0 | 5462 | __m128i alpha = expand_alpha_1x128 (ms); |
michael@0 | 5463 | __m128i dest = xmm_mask; |
michael@0 | 5464 | __m128i alpha_dst = unpack_32_1x128 (d); |
michael@0 | 5465 | |
michael@0 | 5466 | *dst = pack_1x128_32 ( |
michael@0 | 5467 | in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); |
michael@0 | 5468 | } |
michael@0 | 5469 | dst++; |
michael@0 | 5470 | w--; |
michael@0 | 5471 | } |
michael@0 | 5472 | |
michael@0 | 5473 | while (w >= 4) |
michael@0 | 5474 | { |
michael@0 | 5475 | uint32_t tmp1, tmp2, tmp3, tmp4; |
michael@0 | 5476 | |
michael@0 | 5477 | tmp1 = *(src + pixman_fixed_to_int (vx)); |
michael@0 | 5478 | vx += unit_x; |
michael@0 | 5479 | while (vx >= 0) |
michael@0 | 5480 | vx -= src_width_fixed; |
michael@0 | 5481 | tmp2 = *(src + pixman_fixed_to_int (vx)); |
michael@0 | 5482 | vx += unit_x; |
michael@0 | 5483 | while (vx >= 0) |
michael@0 | 5484 | vx -= src_width_fixed; |
michael@0 | 5485 | tmp3 = *(src + pixman_fixed_to_int (vx)); |
michael@0 | 5486 | vx += unit_x; |
michael@0 | 5487 | while (vx >= 0) |
michael@0 | 5488 | vx -= src_width_fixed; |
michael@0 | 5489 | tmp4 = *(src + pixman_fixed_to_int (vx)); |
michael@0 | 5490 | vx += unit_x; |
michael@0 | 5491 | while (vx >= 0) |
michael@0 | 5492 | vx -= src_width_fixed; |
michael@0 | 5493 | |
michael@0 | 5494 | xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1); |
michael@0 | 5495 | |
michael@0 | 5496 | if (!is_zero (xmm_src)) |
michael@0 | 5497 | { |
michael@0 | 5498 | xmm_dst = load_128_aligned ((__m128i*)dst); |
michael@0 | 5499 | |
michael@0 | 5500 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 5501 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 5502 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
michael@0 | 5503 | &xmm_alpha_lo, &xmm_alpha_hi); |
michael@0 | 5504 | |
michael@0 | 5505 | in_over_2x128 (&xmm_src_lo, &xmm_src_hi, |
michael@0 | 5506 | &xmm_alpha_lo, &xmm_alpha_hi, |
michael@0 | 5507 | &xmm_mask, &xmm_mask, |
michael@0 | 5508 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 5509 | |
michael@0 | 5510 | save_128_aligned ( |
michael@0 | 5511 | (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 5512 | } |
michael@0 | 5513 | |
michael@0 | 5514 | dst += 4; |
michael@0 | 5515 | w -= 4; |
michael@0 | 5516 | } |
michael@0 | 5517 | |
michael@0 | 5518 | while (w) |
michael@0 | 5519 | { |
michael@0 | 5520 | uint32_t s = *(src + pixman_fixed_to_int (vx)); |
michael@0 | 5521 | vx += unit_x; |
michael@0 | 5522 | while (vx >= 0) |
michael@0 | 5523 | vx -= src_width_fixed; |
michael@0 | 5524 | |
michael@0 | 5525 | if (s) |
michael@0 | 5526 | { |
michael@0 | 5527 | uint32_t d = *dst; |
michael@0 | 5528 | |
michael@0 | 5529 | __m128i ms = unpack_32_1x128 (s); |
michael@0 | 5530 | __m128i alpha = expand_alpha_1x128 (ms); |
michael@0 | 5531 | __m128i mask = xmm_mask; |
michael@0 | 5532 | __m128i dest = unpack_32_1x128 (d); |
michael@0 | 5533 | |
michael@0 | 5534 | *dst = pack_1x128_32 ( |
michael@0 | 5535 | in_over_1x128 (&ms, &alpha, &mask, &dest)); |
michael@0 | 5536 | } |
michael@0 | 5537 | |
michael@0 | 5538 | dst++; |
michael@0 | 5539 | w--; |
michael@0 | 5540 | } |
michael@0 | 5541 | |
michael@0 | 5542 | } |
michael@0 | 5543 | |
michael@0 | 5544 | FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, |
michael@0 | 5545 | scaled_nearest_scanline_sse2_8888_n_8888_OVER, |
michael@0 | 5546 | uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE) |
michael@0 | 5547 | FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, |
michael@0 | 5548 | scaled_nearest_scanline_sse2_8888_n_8888_OVER, |
michael@0 | 5549 | uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE) |
michael@0 | 5550 | FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, |
michael@0 | 5551 | scaled_nearest_scanline_sse2_8888_n_8888_OVER, |
michael@0 | 5552 | uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE) |
michael@0 | 5553 | FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, |
michael@0 | 5554 | scaled_nearest_scanline_sse2_8888_n_8888_OVER, |
michael@0 | 5555 | uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE) |
michael@0 | 5556 | |
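michael@0 |  | /* Bilinear interpolation with BILINEAR_INTERPOLATION_BITS of weight |
michael@0 |  | * precision.  With W = 1 << BILINEAR_INTERPOLATION_BITS, the vertical |
michael@0 |  | * weights satisfy wt + wb == W and the horizontal weight fx is taken |
michael@0 |  | * from the fractional bits of vx, giving per channel: |
michael@0 |  | * |
michael@0 |  | * pix = ((tl * wt + bl * wb) * (W - fx) |
michael@0 |  | * + (tr * wt + br * wb) * fx) >> (2 * BILINEAR_INTERPOLATION_BITS) |
michael@0 |  | * |
michael@0 |  | * The xorc/addc constant pairs build the complement weight cheaply: |
michael@0 |  | * since BMSK == W - 1, (fx ^ BMSK) + 1 == W - fx. |
michael@0 |  | */ |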
michael@0 | 5557 | #define BMSK ((1 << BILINEAR_INTERPOLATION_BITS) - 1) |
michael@0 | 5558 | |
michael@0 | 5559 | #define BILINEAR_DECLARE_VARIABLES \ |
michael@0 | 5560 | const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt); \ |
michael@0 | 5561 | const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb); \ |
michael@0 | 5562 | const __m128i xmm_xorc8 = _mm_set_epi16 (0, 0, 0, 0, BMSK, BMSK, BMSK, BMSK);\ |
michael@0 | 5563 | const __m128i xmm_addc8 = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1); \ |
michael@0 | 5564 | const __m128i xmm_xorc7 = _mm_set_epi16 (0, BMSK, 0, BMSK, 0, BMSK, 0, BMSK);\ |
michael@0 | 5565 | const __m128i xmm_addc7 = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1); \ |
michael@0 | 5566 | const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x, \ |
michael@0 | 5567 | unit_x, unit_x, unit_x, unit_x); \ |
michael@0 | 5568 | const __m128i xmm_zero = _mm_setzero_si128 (); \ |
michael@0 | 5569 | __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx, vx, vx, vx, vx) |
michael@0 | 5570 | |
michael@0 | 5571 | #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \ |
michael@0 | 5572 | do { \ |
michael@0 | 5573 | __m128i xmm_wh, xmm_lo, xmm_hi, a; \ |
michael@0 | 5574 | /* fetch 2x2 pixel block into sse2 registers */ \ |
michael@0 | 5575 | __m128i tltr = _mm_loadl_epi64 ( \ |
michael@0 | 5576 | (__m128i *)&src_top[pixman_fixed_to_int (vx)]); \ |
michael@0 | 5577 | __m128i blbr = _mm_loadl_epi64 ( \ |
michael@0 | 5578 | (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]); \ |
michael@0 | 5579 | vx += unit_x; \ |
michael@0 | 5580 | /* vertical interpolation */ \ |
michael@0 | 5581 | a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero), \ |
michael@0 | 5582 | xmm_wt), \ |
michael@0 | 5583 | _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero), \ |
michael@0 | 5584 | xmm_wb)); \ |
michael@0 | 5585 | if (BILINEAR_INTERPOLATION_BITS < 8) \ |
michael@0 | 5586 | { \ |
michael@0 | 5587 | /* calculate horizontal weights */ \ |
michael@0 | 5588 | xmm_wh = _mm_add_epi16 (xmm_addc7, _mm_xor_si128 (xmm_xorc7, \ |
michael@0 | 5589 | _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \ |
michael@0 | 5590 | xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ |
michael@0 | 5591 | /* horizontal interpolation */ \ |
michael@0 | 5592 | a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 ( \ |
michael@0 | 5593 | a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh); \ |
michael@0 | 5594 | } \ |
michael@0 | 5595 | else \ |
michael@0 | 5596 | { \ |
michael@0 | 5597 | /* calculate horizontal weights */ \ |
michael@0 | 5598 | xmm_wh = _mm_add_epi16 (xmm_addc8, _mm_xor_si128 (xmm_xorc8, \ |
michael@0 | 5599 | _mm_srli_epi16 (xmm_x, 16 - BILINEAR_INTERPOLATION_BITS))); \ |
michael@0 | 5600 | xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ |
michael@0 | 5601 | /* horizontal interpolation */ \ |
michael@0 | 5602 | xmm_lo = _mm_mullo_epi16 (a, xmm_wh); \ |
michael@0 | 5603 | xmm_hi = _mm_mulhi_epu16 (a, xmm_wh); \ |
michael@0 | 5604 | a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi), \ |
michael@0 | 5605 | _mm_unpackhi_epi16 (xmm_lo, xmm_hi)); \ |
michael@0 | 5606 | } \ |
michael@0 | 5607 | /* shift and pack the result */ \ |
michael@0 | 5608 | a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2); \ |
michael@0 | 5609 | a = _mm_packs_epi32 (a, a); \ |
michael@0 | 5610 | a = _mm_packus_epi16 (a, a); \ |
michael@0 | 5611 | pix = _mm_cvtsi128_si32 (a); \ |
michael@0 | 5612 | } while (0) |
michael@0 | 5613 | |
michael@0 | 5614 | #define BILINEAR_SKIP_ONE_PIXEL() \ |
michael@0 | 5615 | do { \ |
michael@0 | 5616 | vx += unit_x; \ |
michael@0 | 5617 | xmm_x = _mm_add_epi16 (xmm_x, xmm_ux); \ |
michael@0 | 5618 | } while (0) |
michael@0 | 5619 | |
michael@0 | 5620 | static force_inline void |
michael@0 | 5621 | scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t * dst, |
michael@0 | 5622 | const uint32_t * mask, |
michael@0 | 5623 | const uint32_t * src_top, |
michael@0 | 5624 | const uint32_t * src_bottom, |
michael@0 | 5625 | int32_t w, |
michael@0 | 5626 | int wt, |
michael@0 | 5627 | int wb, |
michael@0 | 5628 | pixman_fixed_t vx, |
michael@0 | 5629 | pixman_fixed_t unit_x, |
michael@0 | 5630 | pixman_fixed_t max_vx, |
michael@0 | 5631 | pixman_bool_t zero_src) |
michael@0 | 5632 | { |
michael@0 | 5633 | BILINEAR_DECLARE_VARIABLES; |
michael@0 | 5634 | uint32_t pix1, pix2, pix3, pix4; |
michael@0 | 5635 | |
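michael@0 |  | /* Once "while ((w -= 4) >= 0)" exits, w equals (original w % 4) - 4; |
michael@0 |  | * the bit tests below still work on that negative value: bit 1 is set |
michael@0 |  | * for remainders 2 and 3, bit 0 for remainders 1 and 3. |
michael@0 |  | */ |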
michael@0 | 5636 | while ((w -= 4) >= 0) |
michael@0 | 5637 | { |
michael@0 | 5638 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
michael@0 | 5639 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); |
michael@0 | 5640 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); |
michael@0 | 5641 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); |
michael@0 | 5642 | *dst++ = pix1; |
michael@0 | 5643 | *dst++ = pix2; |
michael@0 | 5644 | *dst++ = pix3; |
michael@0 | 5645 | *dst++ = pix4; |
michael@0 | 5646 | } |
michael@0 | 5647 | |
michael@0 | 5648 | if (w & 2) |
michael@0 | 5649 | { |
michael@0 | 5650 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
michael@0 | 5651 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); |
michael@0 | 5652 | *dst++ = pix1; |
michael@0 | 5653 | *dst++ = pix2; |
michael@0 | 5654 | } |
michael@0 | 5655 | |
michael@0 | 5656 | if (w & 1) |
michael@0 | 5657 | { |
michael@0 | 5658 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
michael@0 | 5659 | *dst = pix1; |
michael@0 | 5660 | } |
michael@0 | 5661 | |
michael@0 | 5662 | } |
michael@0 | 5663 | |
michael@0 | 5664 | /* Add extra NULL argument to the existing bilinear fast paths to indicate |
michael@0 | 5665 | * that we don't need two-pass processing */ |
michael@0 | 5666 | |
michael@0 | 5667 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC, |
michael@0 | 5668 | scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, |
michael@0 | 5669 | uint32_t, uint32_t, uint32_t, |
michael@0 | 5670 | COVER, FLAG_NONE) |
michael@0 | 5671 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC, |
michael@0 | 5672 | scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, |
michael@0 | 5673 | uint32_t, uint32_t, uint32_t, |
michael@0 | 5674 | PAD, FLAG_NONE) |
michael@0 | 5675 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC, |
michael@0 | 5676 | scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, |
michael@0 | 5677 | uint32_t, uint32_t, uint32_t, |
michael@0 | 5678 | NONE, FLAG_NONE) |
michael@0 | 5679 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC, |
michael@0 | 5680 | scaled_bilinear_scanline_sse2_8888_8888_SRC, NULL, |
michael@0 | 5681 | uint32_t, uint32_t, uint32_t, |
michael@0 | 5682 | NORMAL, FLAG_NONE) |
michael@0 | 5683 | |
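/* FAST_BILINEAR_MAINLOOP_COMMON (from pixman-inlines.h) expands each invocation
 * above into a complete composite function for the repeat mode named in its
 * second-to-last argument (COVER/PAD/NONE/NORMAL): it walks the destination,
 * sets up per-scanline source rows and weights, and calls the given scanline
 * function; the NULL argument means no second-pass operator is applied to the
 * scanline output. */
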
michael@0 | 5684 | static force_inline void |
michael@0 | 5685 | scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t * dst, |
michael@0 | 5686 | const uint32_t * mask, |
michael@0 | 5687 | const uint32_t * src_top, |
michael@0 | 5688 | const uint32_t * src_bottom, |
michael@0 | 5689 | int32_t w, |
michael@0 | 5690 | int wt, |
michael@0 | 5691 | int wb, |
michael@0 | 5692 | pixman_fixed_t vx, |
michael@0 | 5693 | pixman_fixed_t unit_x, |
michael@0 | 5694 | pixman_fixed_t max_vx, |
michael@0 | 5695 | pixman_bool_t zero_src) |
michael@0 | 5696 | { |
michael@0 | 5697 | BILINEAR_DECLARE_VARIABLES; |
michael@0 | 5698 | uint32_t pix1, pix2, pix3, pix4; |
michael@0 | 5699 | |
michael@0 | 5700 | while (w && ((uintptr_t)dst & 15)) |
michael@0 | 5701 | { |
michael@0 | 5702 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
michael@0 | 5703 | |
michael@0 | 5704 | if (pix1) |
michael@0 | 5705 | { |
michael@0 | 5706 | pix2 = *dst; |
michael@0 | 5707 | *dst = core_combine_over_u_pixel_sse2 (pix1, pix2); |
michael@0 | 5708 | } |
michael@0 | 5709 | |
michael@0 | 5710 | w--; |
michael@0 | 5711 | dst++; |
michael@0 | 5712 | } |
michael@0 | 5713 | |
michael@0 | 5714 | while (w >= 4) |
michael@0 | 5715 | { |
michael@0 | 5716 | __m128i xmm_src; |
michael@0 | 5717 | __m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo; |
michael@0 | 5718 | __m128i xmm_alpha_hi, xmm_alpha_lo; |
michael@0 | 5719 | |
michael@0 | 5720 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
michael@0 | 5721 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); |
michael@0 | 5722 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); |
michael@0 | 5723 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); |
michael@0 | 5724 | |
michael@0 | 5725 | xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); |
michael@0 | 5726 | |
michael@0 | 5727 | if (!is_zero (xmm_src)) |
michael@0 | 5728 | { |
michael@0 | 5729 | if (is_opaque (xmm_src)) |
michael@0 | 5730 | { |
michael@0 | 5731 | save_128_aligned ((__m128i *)dst, xmm_src); |
michael@0 | 5732 | } |
michael@0 | 5733 | else |
michael@0 | 5734 | { |
michael@0 | 5735 | __m128i xmm_dst = load_128_aligned ((__m128i *)dst); |
michael@0 | 5736 | |
michael@0 | 5737 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 5738 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 5739 | |
michael@0 | 5740 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi); |
michael@0 | 5741 | over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi, |
michael@0 | 5742 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 5743 | |
michael@0 | 5744 | save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 5745 | } |
michael@0 | 5746 | } |
michael@0 | 5747 | |
michael@0 | 5748 | w -= 4; |
michael@0 | 5749 | dst += 4; |
michael@0 | 5750 | } |
michael@0 | 5751 | |
michael@0 | 5752 | while (w) |
michael@0 | 5753 | { |
michael@0 | 5754 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
michael@0 | 5755 | |
michael@0 | 5756 | if (pix1) |
michael@0 | 5757 | { |
michael@0 | 5758 | pix2 = *dst; |
michael@0 | 5759 | *dst = core_combine_over_u_pixel_sse2 (pix1, pix2); |
michael@0 | 5760 | } |
michael@0 | 5761 | |
michael@0 | 5762 | w--; |
michael@0 | 5763 | dst++; |
michael@0 | 5764 | } |
michael@0 | 5765 | } |
michael@0 | 5766 | |
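/* The per-pixel operation in the OVER scanline above is the standard
 * premultiplied-alpha OVER: dst = src + dst * (1 - src.alpha).  A scalar
 * sketch of what core_combine_over_u_pixel_sse2 computes (the helper name
 * over_scalar is an illustrative assumption; the block is disabled so the
 * compiled code is unchanged):
 */
#if 0
static uint32_t
over_scalar (uint32_t src, uint32_t dst)
{
    uint32_t ia = 255 - (src >> 24);    /* inverse source alpha */
    uint32_t result = src;
    int c;

    for (c = 0; c < 32; c += 8)
    {
        /* scale the dst channel by ia with the usual x * a / 255
         * approximation: t = x * a + 0x80; t = (t + (t >> 8)) >> 8 */
        uint32_t t = ((dst >> c) & 0xff) * ia + 0x80;

        t = (t + (t >> 8)) >> 8;
        result += t << c;   /* cannot carry for well-formed premultiplied input */
    }
    return result;
}
#endif
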
michael@0 | 5767 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER, |
michael@0 | 5768 | scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, |
michael@0 | 5769 | uint32_t, uint32_t, uint32_t, |
michael@0 | 5770 | COVER, FLAG_NONE) |
michael@0 | 5771 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER, |
michael@0 | 5772 | scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, |
michael@0 | 5773 | uint32_t, uint32_t, uint32_t, |
michael@0 | 5774 | PAD, FLAG_NONE) |
michael@0 | 5775 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER, |
michael@0 | 5776 | scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, |
michael@0 | 5777 | uint32_t, uint32_t, uint32_t, |
michael@0 | 5778 | NONE, FLAG_NONE) |
michael@0 | 5779 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER, |
michael@0 | 5780 | scaled_bilinear_scanline_sse2_8888_8888_OVER, NULL, |
michael@0 | 5781 | uint32_t, uint32_t, uint32_t, |
michael@0 | 5782 | NORMAL, FLAG_NONE) |
michael@0 | 5783 | |
michael@0 | 5784 | |
michael@0 | 5785 | /* An example of an SSE2 two-stage bilinear_over_8888_0565 fast path, implemented as |
michael@0 | 5786 | * scaled_bilinear_scanline_sse2_8888_8888_SRC followed by op_bilinear_over_8888_0565 */ |
michael@0 | 5787 | |
michael@0 | 5788 | void op_bilinear_over_8888_0565 (uint16_t *dst, const uint32_t *mask, const uint32_t *src, int width) |
michael@0 | 5789 | { |
michael@0 | 5790 | /* Note: this is not really fast; it should be based on the 8-pixel loop from sse2_composite_over_8888_0565 */ |
michael@0 | 5791 | while (--width >= 0) |
michael@0 | 5792 | { |
michael@0 | 5793 | *dst = composite_over_8888_0565pixel (*src, *dst); |
michael@0 | 5794 | src++; |
michael@0 | 5795 | dst++; |
michael@0 | 5796 | } |
michael@0 | 5797 | } |
michael@0 | 5798 | |
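/* composite_over_8888_0565pixel amounts to: widen the r5g6b5 destination pixel
 * to 8888, apply premultiplied OVER, and pack back to r5g6b5.  Sketch using the
 * convert_0565_to_8888 / convert_8888_to_0565 helpers from pixman-private.h and
 * the illustrative over_scalar () sketched earlier (disabled):
 */
#if 0
static uint16_t
over_0565_scalar (uint32_t src, uint16_t dst)
{
    return convert_8888_to_0565 (over_scalar (src, convert_0565_to_8888 (dst)));
}
#endif
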
michael@0 | 5799 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_cover_OVER, |
michael@0 | 5800 | scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, |
michael@0 | 5801 | uint32_t, uint32_t, uint16_t, |
michael@0 | 5802 | COVER, FLAG_NONE) |
michael@0 | 5803 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_pad_OVER, |
michael@0 | 5804 | scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, |
michael@0 | 5805 | uint32_t, uint32_t, uint16_t, |
michael@0 | 5806 | PAD, FLAG_NONE) |
michael@0 | 5807 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_none_OVER, |
michael@0 | 5808 | scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, |
michael@0 | 5809 | uint32_t, uint32_t, uint16_t, |
michael@0 | 5810 | NONE, FLAG_NONE) |
michael@0 | 5811 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_0565_normal_OVER, |
michael@0 | 5812 | scaled_bilinear_scanline_sse2_8888_8888_SRC, op_bilinear_over_8888_0565, |
michael@0 | 5813 | uint32_t, uint32_t, uint16_t, |
michael@0 | 5814 | NORMAL, FLAG_NONE) |
michael@0 | 5815 | |
michael@0 | 5816 | /*****************************/ |
michael@0 | 5817 | |
michael@0 | 5818 | static force_inline void |
michael@0 | 5819 | scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t * dst, |
michael@0 | 5820 | const uint8_t * mask, |
michael@0 | 5821 | const uint32_t * src_top, |
michael@0 | 5822 | const uint32_t * src_bottom, |
michael@0 | 5823 | int32_t w, |
michael@0 | 5824 | int wt, |
michael@0 | 5825 | int wb, |
michael@0 | 5826 | pixman_fixed_t vx, |
michael@0 | 5827 | pixman_fixed_t unit_x, |
michael@0 | 5828 | pixman_fixed_t max_vx, |
michael@0 | 5829 | pixman_bool_t zero_src) |
michael@0 | 5830 | { |
michael@0 | 5831 | BILINEAR_DECLARE_VARIABLES; |
michael@0 | 5832 | uint32_t pix1, pix2, pix3, pix4; |
michael@0 | 5833 | uint32_t m; |
michael@0 | 5834 | |
michael@0 | 5835 | while (w && ((uintptr_t)dst & 15)) |
michael@0 | 5836 | { |
michael@0 | 5837 | uint32_t sa; |
michael@0 | 5838 | |
michael@0 | 5839 | m = (uint32_t) *mask++; |
michael@0 | 5840 | |
michael@0 | 5841 | if (m) |
michael@0 | 5842 | { |
michael@0 | 5843 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
michael@0 | 5844 | sa = pix1 >> 24; |
michael@0 | 5845 | |
michael@0 | 5846 | if (sa == 0xff && m == 0xff) |
michael@0 | 5847 | { |
michael@0 | 5848 | *dst = pix1; |
michael@0 | 5849 | } |
michael@0 | 5850 | else |
michael@0 | 5851 | { |
michael@0 | 5852 | __m128i ms, md, ma, msa; |
michael@0 | 5853 | |
michael@0 | 5854 | pix2 = *dst; |
michael@0 | 5855 | ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); |
michael@0 | 5856 | ms = unpack_32_1x128 (pix1); |
michael@0 | 5857 | md = unpack_32_1x128 (pix2); |
michael@0 | 5858 | |
michael@0 | 5859 | msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); |
michael@0 | 5860 | |
michael@0 | 5861 | *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); |
michael@0 | 5862 | } |
michael@0 | 5863 | } |
michael@0 | 5864 | else |
michael@0 | 5865 | { |
michael@0 | 5866 | BILINEAR_SKIP_ONE_PIXEL (); |
michael@0 | 5867 | } |
michael@0 | 5868 | |
michael@0 | 5869 | w--; |
michael@0 | 5870 | dst++; |
michael@0 | 5871 | } |
michael@0 | 5872 | |
michael@0 | 5873 | while (w >= 4) |
michael@0 | 5874 | { |
michael@0 | 5875 | __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi; |
michael@0 | 5876 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
michael@0 | 5877 | __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi; |
michael@0 | 5878 | |
michael@0 | 5879 | m = *(uint32_t*)mask; |
michael@0 | 5880 | |
michael@0 | 5881 | if (m) |
michael@0 | 5882 | { |
michael@0 | 5883 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
michael@0 | 5884 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); |
michael@0 | 5885 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); |
michael@0 | 5886 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); |
michael@0 | 5887 | |
michael@0 | 5888 | xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); |
michael@0 | 5889 | |
michael@0 | 5890 | if (m == 0xffffffff && is_opaque (xmm_src)) |
michael@0 | 5891 | { |
michael@0 | 5892 | save_128_aligned ((__m128i *)dst, xmm_src); |
michael@0 | 5893 | } |
michael@0 | 5894 | else |
michael@0 | 5895 | { |
michael@0 | 5896 | xmm_dst = load_128_aligned ((__m128i *)dst); |
michael@0 | 5897 | |
michael@0 | 5898 | xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128()); |
michael@0 | 5899 | |
michael@0 | 5900 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 5901 | unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 5902 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 5903 | |
michael@0 | 5904 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi); |
michael@0 | 5905 | expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi); |
michael@0 | 5906 | |
michael@0 | 5907 | in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi, |
michael@0 | 5908 | &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 5909 | |
michael@0 | 5910 | save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 5911 | } |
michael@0 | 5912 | } |
michael@0 | 5913 | else |
michael@0 | 5914 | { |
michael@0 | 5915 | BILINEAR_SKIP_ONE_PIXEL (); |
michael@0 | 5916 | BILINEAR_SKIP_ONE_PIXEL (); |
michael@0 | 5917 | BILINEAR_SKIP_ONE_PIXEL (); |
michael@0 | 5918 | BILINEAR_SKIP_ONE_PIXEL (); |
michael@0 | 5919 | } |
michael@0 | 5920 | |
michael@0 | 5921 | w -= 4; |
michael@0 | 5922 | dst += 4; |
michael@0 | 5923 | mask += 4; |
michael@0 | 5924 | } |
michael@0 | 5925 | |
michael@0 | 5926 | while (w) |
michael@0 | 5927 | { |
michael@0 | 5928 | uint32_t sa; |
michael@0 | 5929 | |
michael@0 | 5930 | m = (uint32_t) *mask++; |
michael@0 | 5931 | |
michael@0 | 5932 | if (m) |
michael@0 | 5933 | { |
michael@0 | 5934 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
michael@0 | 5935 | sa = pix1 >> 24; |
michael@0 | 5936 | |
michael@0 | 5937 | if (sa == 0xff && m == 0xff) |
michael@0 | 5938 | { |
michael@0 | 5939 | *dst = pix1; |
michael@0 | 5940 | } |
michael@0 | 5941 | else |
michael@0 | 5942 | { |
michael@0 | 5943 | __m128i ms, md, ma, msa; |
michael@0 | 5944 | |
michael@0 | 5945 | pix2 = *dst; |
michael@0 | 5946 | ma = expand_alpha_rev_1x128 (load_32_1x128 (m)); |
michael@0 | 5947 | ms = unpack_32_1x128 (pix1); |
michael@0 | 5948 | md = unpack_32_1x128 (pix2); |
michael@0 | 5949 | |
michael@0 | 5950 | msa = expand_alpha_rev_1x128 (load_32_1x128 (sa)); |
michael@0 | 5951 | |
michael@0 | 5952 | *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md)); |
michael@0 | 5953 | } |
michael@0 | 5954 | } |
michael@0 | 5955 | else |
michael@0 | 5956 | { |
michael@0 | 5957 | BILINEAR_SKIP_ONE_PIXEL (); |
michael@0 | 5958 | } |
michael@0 | 5959 | |
michael@0 | 5960 | w--; |
michael@0 | 5961 | dst++; |
michael@0 | 5962 | } |
michael@0 | 5963 | } |
michael@0 | 5964 | |
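/* in_over_1x128 used above implements the combined "mask IN source, OVER
 * destination" operator: with mask alpha m, dst = src*m + dst*(1 - src.alpha*m),
 * all in premultiplied 8-bit arithmetic.  Scalar sketch (helper names are
 * illustrative assumptions; disabled):
 */
#if 0
static uint32_t
mul_div255 (uint32_t x, uint32_t a)
{
    uint32_t t = x * a + 0x80;

    return (t + (t >> 8)) >> 8;
}

static uint32_t
in_over_scalar (uint32_t src, uint32_t m, uint32_t dst)
{
    uint32_t ia = 255 - mul_div255 (src >> 24, m);  /* 1 - src.alpha * m */
    uint32_t result = 0;
    int c;

    for (c = 0; c < 32; c += 8)
    {
        uint32_t s = mul_div255 ((src >> c) & 0xff, m);  /* src IN mask */
        uint32_t d = mul_div255 ((dst >> c) & 0xff, ia); /* dst * (1 - sa*m) */

        result |= (s + d) << c;
    }
    return result;
}
#endif
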
michael@0 | 5965 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER, |
michael@0 | 5966 | scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, |
michael@0 | 5967 | uint32_t, uint8_t, uint32_t, |
michael@0 | 5968 | COVER, FLAG_HAVE_NON_SOLID_MASK) |
michael@0 | 5969 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER, |
michael@0 | 5970 | scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, |
michael@0 | 5971 | uint32_t, uint8_t, uint32_t, |
michael@0 | 5972 | PAD, FLAG_HAVE_NON_SOLID_MASK) |
michael@0 | 5973 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER, |
michael@0 | 5974 | scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, |
michael@0 | 5975 | uint32_t, uint8_t, uint32_t, |
michael@0 | 5976 | NONE, FLAG_HAVE_NON_SOLID_MASK) |
michael@0 | 5977 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER, |
michael@0 | 5978 | scaled_bilinear_scanline_sse2_8888_8_8888_OVER, NULL, |
michael@0 | 5979 | uint32_t, uint8_t, uint32_t, |
michael@0 | 5980 | NORMAL, FLAG_HAVE_NON_SOLID_MASK) |
michael@0 | 5981 | |
michael@0 | 5982 | static force_inline void |
michael@0 | 5983 | scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t * dst, |
michael@0 | 5984 | const uint32_t * mask, |
michael@0 | 5985 | const uint32_t * src_top, |
michael@0 | 5986 | const uint32_t * src_bottom, |
michael@0 | 5987 | int32_t w, |
michael@0 | 5988 | int wt, |
michael@0 | 5989 | int wb, |
michael@0 | 5990 | pixman_fixed_t vx, |
michael@0 | 5991 | pixman_fixed_t unit_x, |
michael@0 | 5992 | pixman_fixed_t max_vx, |
michael@0 | 5993 | pixman_bool_t zero_src) |
michael@0 | 5994 | { |
michael@0 | 5995 | BILINEAR_DECLARE_VARIABLES; |
michael@0 | 5996 | uint32_t pix1, pix2, pix3, pix4; |
michael@0 | 5997 | __m128i xmm_mask; |
michael@0 | 5998 | |
michael@0 | 5999 | if (zero_src || (*mask >> 24) == 0) |
michael@0 | 6000 | return; |
michael@0 | 6001 | |
michael@0 | 6002 | xmm_mask = create_mask_16_128 (*mask >> 24); |
michael@0 | 6003 | |
michael@0 | 6004 | while (w && ((uintptr_t)dst & 15)) |
michael@0 | 6005 | { |
michael@0 | 6006 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
michael@0 | 6007 | if (pix1) |
michael@0 | 6008 | { |
michael@0 | 6009 | uint32_t d = *dst; |
michael@0 | 6010 | |
michael@0 | 6011 | __m128i ms = unpack_32_1x128 (pix1); |
michael@0 | 6012 | __m128i alpha = expand_alpha_1x128 (ms); |
michael@0 | 6013 | __m128i dest = xmm_mask; |
michael@0 | 6014 | __m128i alpha_dst = unpack_32_1x128 (d); |
michael@0 | 6015 | |
michael@0 | 6016 | *dst = pack_1x128_32 |
michael@0 | 6017 | (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); |
michael@0 | 6018 | } |
michael@0 | 6019 | |
michael@0 | 6020 | dst++; |
michael@0 | 6021 | w--; |
michael@0 | 6022 | } |
michael@0 | 6023 | |
michael@0 | 6024 | while (w >= 4) |
michael@0 | 6025 | { |
michael@0 | 6026 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
michael@0 | 6027 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix2); |
michael@0 | 6028 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix3); |
michael@0 | 6029 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix4); |
michael@0 | 6030 | |
michael@0 | 6031 | if (pix1 | pix2 | pix3 | pix4) |
michael@0 | 6032 | { |
michael@0 | 6033 | __m128i xmm_src, xmm_src_lo, xmm_src_hi; |
michael@0 | 6034 | __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi; |
michael@0 | 6035 | __m128i xmm_alpha_lo, xmm_alpha_hi; |
michael@0 | 6036 | |
michael@0 | 6037 | xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1); |
michael@0 | 6038 | |
michael@0 | 6039 | xmm_dst = load_128_aligned ((__m128i*)dst); |
michael@0 | 6040 | |
michael@0 | 6041 | unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi); |
michael@0 | 6042 | unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 6043 | expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, |
michael@0 | 6044 | &xmm_alpha_lo, &xmm_alpha_hi); |
michael@0 | 6045 | |
michael@0 | 6046 | in_over_2x128 (&xmm_src_lo, &xmm_src_hi, |
michael@0 | 6047 | &xmm_alpha_lo, &xmm_alpha_hi, |
michael@0 | 6048 | &xmm_mask, &xmm_mask, |
michael@0 | 6049 | &xmm_dst_lo, &xmm_dst_hi); |
michael@0 | 6050 | |
michael@0 | 6051 | save_128_aligned |
michael@0 | 6052 | ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi)); |
michael@0 | 6053 | } |
michael@0 | 6054 | |
michael@0 | 6055 | dst += 4; |
michael@0 | 6056 | w -= 4; |
michael@0 | 6057 | } |
michael@0 | 6058 | |
michael@0 | 6059 | while (w) |
michael@0 | 6060 | { |
michael@0 | 6061 | BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
michael@0 | 6062 | if (pix1) |
michael@0 | 6063 | { |
michael@0 | 6064 | uint32_t d = *dst; |
michael@0 | 6065 | |
michael@0 | 6066 | __m128i ms = unpack_32_1x128 (pix1); |
michael@0 | 6067 | __m128i alpha = expand_alpha_1x128 (ms); |
michael@0 | 6068 | __m128i dest = xmm_mask; |
michael@0 | 6069 | __m128i alpha_dst = unpack_32_1x128 (d); |
michael@0 | 6070 | |
michael@0 | 6071 | *dst = pack_1x128_32 |
michael@0 | 6072 | (in_over_1x128 (&ms, &alpha, &dest, &alpha_dst)); |
michael@0 | 6073 | } |
michael@0 | 6074 | |
michael@0 | 6075 | dst++; |
michael@0 | 6076 | w--; |
michael@0 | 6077 | } |
michael@0 | 6078 | } |
michael@0 | 6079 | |
michael@0 | 6080 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER, |
michael@0 | 6081 | scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL, |
michael@0 | 6082 | uint32_t, uint32_t, uint32_t, |
michael@0 | 6083 | COVER, FLAG_HAVE_SOLID_MASK) |
michael@0 | 6084 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER, |
michael@0 | 6085 | scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL, |
michael@0 | 6086 | uint32_t, uint32_t, uint32_t, |
michael@0 | 6087 | PAD, FLAG_HAVE_SOLID_MASK) |
michael@0 | 6088 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER, |
michael@0 | 6089 | scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL, |
michael@0 | 6090 | uint32_t, uint32_t, uint32_t, |
michael@0 | 6091 | NONE, FLAG_HAVE_SOLID_MASK) |
michael@0 | 6092 | FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER, |
michael@0 | 6093 | scaled_bilinear_scanline_sse2_8888_n_8888_OVER, NULL, |
michael@0 | 6094 | uint32_t, uint32_t, uint32_t, |
michael@0 | 6095 | NORMAL, FLAG_HAVE_SOLID_MASK) |
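
/* The FLAG_HAVE_SOLID_MASK / FLAG_HAVE_NON_SOLID_MASK arguments above tell the
 * generated main loop how to feed the mask parameter of the scanline function:
 * a solid mask is fetched once as a single color, a non-solid (a8) mask is
 * fetched per scanline, and FLAG_NONE means no mask is passed at all.  (This
 * description follows the flag names; see pixman-inlines.h for the details.) */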
michael@0 | 6096 | |
michael@0 | 6097 | static const pixman_fast_path_t sse2_fast_paths[] = |
michael@0 | 6098 | { |
michael@0 | 6099 | /* PIXMAN_OP_OVER */ |
michael@0 | 6100 | PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565), |
michael@0 | 6101 | PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565), |
michael@0 | 6102 | PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888), |
michael@0 | 6103 | PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888), |
michael@0 | 6104 | PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565), |
michael@0 | 6105 | PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565), |
michael@0 | 6106 | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888), |
michael@0 | 6107 | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888), |
michael@0 | 6108 | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888), |
michael@0 | 6109 | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888), |
michael@0 | 6110 | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565), |
michael@0 | 6111 | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565), |
michael@0 | 6112 | PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888), |
michael@0 | 6113 | PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888), |
michael@0 | 6114 | PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888), |
michael@0 | 6115 | PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888), |
michael@0 | 6116 | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888), |
michael@0 | 6117 | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888), |
michael@0 | 6118 | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888), |
michael@0 | 6119 | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888), |
michael@0 | 6120 | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888), |
michael@0 | 6121 | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888), |
michael@0 | 6122 | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888), |
michael@0 | 6123 | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888), |
michael@0 | 6124 | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888), |
michael@0 | 6125 | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888), |
michael@0 | 6126 | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888), |
michael@0 | 6127 | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888), |
michael@0 | 6128 | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888), |
michael@0 | 6129 | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888), |
michael@0 | 6130 | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888), |
michael@0 | 6131 | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888), |
michael@0 | 6132 | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888), |
michael@0 | 6133 | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca), |
michael@0 | 6134 | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca), |
michael@0 | 6135 | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca), |
michael@0 | 6136 | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca), |
michael@0 | 6137 | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca), |
michael@0 | 6138 | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca), |
michael@0 | 6139 | PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888), |
michael@0 | 6140 | PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888), |
michael@0 | 6141 | PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888), |
michael@0 | 6142 | PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888), |
michael@0 | 6143 | PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565), |
michael@0 | 6144 | PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565), |
michael@0 | 6145 | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), |
michael@0 | 6146 | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), |
michael@0 | 6147 | |
michael@0 | 6148 | /* PIXMAN_OP_OVER_REVERSE */ |
michael@0 | 6149 | PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888), |
michael@0 | 6150 | PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888), |
michael@0 | 6151 | |
michael@0 | 6152 | /* PIXMAN_OP_ADD */ |
michael@0 | 6153 | PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca), |
michael@0 | 6154 | PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8), |
michael@0 | 6155 | PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888), |
michael@0 | 6156 | PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888), |
michael@0 | 6157 | PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8), |
michael@0 | 6158 | PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8), |
michael@0 | 6159 | PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888), |
michael@0 | 6160 | PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888), |
michael@0 | 6161 | PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888), |
michael@0 | 6162 | PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888), |
michael@0 | 6163 | PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888), |
michael@0 | 6164 | PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888), |
michael@0 | 6165 | PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888), |
michael@0 | 6166 | PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888), |
michael@0 | 6167 | |
michael@0 | 6168 | /* PIXMAN_OP_SRC */ |
michael@0 | 6169 | PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888), |
michael@0 | 6170 | PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888), |
michael@0 | 6171 | PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888), |
michael@0 | 6172 | PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888), |
michael@0 | 6173 | PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565), |
michael@0 | 6174 | PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565), |
michael@0 | 6175 | PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565), |
michael@0 | 6176 | PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565), |
michael@0 | 6177 | PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888), |
michael@0 | 6178 | PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888), |
michael@0 | 6179 | PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area), |
michael@0 | 6180 | PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area), |
michael@0 | 6181 | PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), |
michael@0 | 6182 | PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), |
michael@0 | 6183 | PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area), |
michael@0 | 6184 | PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area), |
michael@0 | 6185 | PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area), |
michael@0 | 6186 | PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area), |
michael@0 | 6187 | |
michael@0 | 6188 | /* PIXMAN_OP_IN */ |
michael@0 | 6189 | PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8), |
michael@0 | 6190 | PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8), |
michael@0 | 6191 | PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8), |
michael@0 | 6192 | |
michael@0 | 6193 | SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), |
michael@0 | 6194 | SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), |
michael@0 | 6195 | SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), |
michael@0 | 6196 | SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), |
michael@0 | 6197 | SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), |
michael@0 | 6198 | SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), |
michael@0 | 6199 | SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), |
michael@0 | 6200 | SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), |
michael@0 | 6201 | SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), |
michael@0 | 6202 | SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), |
michael@0 | 6203 | SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), |
michael@0 | 6204 | SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), |
michael@0 | 6205 | SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), |
michael@0 | 6206 | SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), |
michael@0 | 6207 | SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), |
michael@0 | 6208 | SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), |
michael@0 | 6209 | |
michael@0 | 6210 | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), |
michael@0 | 6211 | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), |
michael@0 | 6212 | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), |
michael@0 | 6213 | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), |
michael@0 | 6214 | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), |
michael@0 | 6215 | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), |
michael@0 | 6216 | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), |
michael@0 | 6217 | SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), |
michael@0 | 6218 | |
michael@0 | 6219 | SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888), |
michael@0 | 6220 | SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888), |
michael@0 | 6221 | SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888), |
michael@0 | 6222 | SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888), |
michael@0 | 6223 | SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888), |
michael@0 | 6224 | SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888), |
michael@0 | 6225 | |
michael@0 | 6226 | SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888), |
michael@0 | 6227 | SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888), |
michael@0 | 6228 | SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888), |
michael@0 | 6229 | SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888), |
michael@0 | 6230 | |
michael@0 | 6231 | SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888), |
michael@0 | 6232 | SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888), |
michael@0 | 6233 | SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888), |
michael@0 | 6234 | SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888), |
michael@0 | 6235 | |
michael@0 | 6236 | SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888), |
michael@0 | 6237 | SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888), |
michael@0 | 6238 | SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888), |
michael@0 | 6239 | SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888), |
michael@0 | 6240 | |
michael@0 | 6241 | /* and here the entries needed by the two-stage bilinear OVER fast path above are added to the table */ |
michael@0 | 6242 | |
michael@0 | 6243 | SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, r5g6b5, sse2_8888_0565), |
michael@0 | 6244 | SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, b5g6r5, sse2_8888_0565), |
michael@0 | 6245 | |
michael@0 | 6246 | { PIXMAN_OP_NONE }, |
michael@0 | 6247 | }; |
michael@0 | 6248 | |
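/* Each row of sse2_fast_paths keys a composite function on the tuple
 * (operator, source format, mask format, destination format), plus per-image
 * flag requirements hidden inside the PIXMAN_STD_FAST_PATH macros.  For
 * example, assuming argb_src and rgb565_dst are suitably created images, a
 * call like
 *
 *     pixman_image_composite32 (PIXMAN_OP_OVER,
 *                               argb_src, NULL, rgb565_dst,
 *                               0, 0, 0, 0, 0, 0, width, height);
 *
 * has an a8r8g8b8 source, no mask and an r5g6b5 destination, so it can be
 * dispatched to sse2_composite_over_8888_0565 through the table above,
 * provided the images also satisfy the fast path's flag checks. */
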
michael@0 | 6249 | static uint32_t * |
michael@0 | 6250 | sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) |
michael@0 | 6251 | { |
michael@0 | 6252 | int w = iter->width; |
michael@0 | 6253 | __m128i ff000000 = mask_ff000000; |
michael@0 | 6254 | uint32_t *dst = iter->buffer; |
michael@0 | 6255 | uint32_t *src = (uint32_t *)iter->bits; |
michael@0 | 6256 | |
michael@0 | 6257 | iter->bits += iter->stride; |
michael@0 | 6258 | |
michael@0 | 6259 | while (w && ((uintptr_t)dst) & 0x0f) |
michael@0 | 6260 | { |
michael@0 | 6261 | *dst++ = (*src++) | 0xff000000; |
michael@0 | 6262 | w--; |
michael@0 | 6263 | } |
michael@0 | 6264 | |
michael@0 | 6265 | while (w >= 4) |
michael@0 | 6266 | { |
michael@0 | 6267 | save_128_aligned ( |
michael@0 | 6268 | (__m128i *)dst, _mm_or_si128 ( |
michael@0 | 6269 | load_128_unaligned ((__m128i *)src), ff000000)); |
michael@0 | 6270 | |
michael@0 | 6271 | dst += 4; |
michael@0 | 6272 | src += 4; |
michael@0 | 6273 | w -= 4; |
michael@0 | 6274 | } |
michael@0 | 6275 | |
michael@0 | 6276 | while (w) |
michael@0 | 6277 | { |
michael@0 | 6278 | *dst++ = (*src++) | 0xff000000; |
michael@0 | 6279 | w--; |
michael@0 | 6280 | } |
michael@0 | 6281 | |
michael@0 | 6282 | return iter->buffer; |
michael@0 | 6283 | } |
michael@0 | 6284 | |
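/* sse2_fetch_x8r8g8b8 above, like the two fetchers that follow, uses the usual
 * three-phase shape: a scalar head loop runs until the destination scanline
 * buffer is 16-byte aligned, an SSE2 body then handles full aligned blocks,
 * and a scalar tail finishes the leftovers.  Only the destination is aligned;
 * the source is read with unaligned loads. */
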
michael@0 | 6285 | static uint32_t * |
michael@0 | 6286 | sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) |
michael@0 | 6287 | { |
michael@0 | 6288 | int w = iter->width; |
michael@0 | 6289 | uint32_t *dst = iter->buffer; |
michael@0 | 6290 | uint16_t *src = (uint16_t *)iter->bits; |
michael@0 | 6291 | __m128i ff000000 = mask_ff000000; |
michael@0 | 6292 | |
michael@0 | 6293 | iter->bits += iter->stride; |
michael@0 | 6294 | |
michael@0 | 6295 | while (w && ((uintptr_t)dst) & 0x0f) |
michael@0 | 6296 | { |
michael@0 | 6297 | uint16_t s = *src++; |
michael@0 | 6298 | |
michael@0 | 6299 | *dst++ = convert_0565_to_8888 (s); |
michael@0 | 6300 | w--; |
michael@0 | 6301 | } |
michael@0 | 6302 | |
michael@0 | 6303 | while (w >= 8) |
michael@0 | 6304 | { |
michael@0 | 6305 | __m128i lo, hi, s; |
michael@0 | 6306 | |
michael@0 | 6307 | s = _mm_loadu_si128 ((__m128i *)src); |
michael@0 | 6308 | |
michael@0 | 6309 | lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ())); |
michael@0 | 6310 | hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ())); |
michael@0 | 6311 | |
michael@0 | 6312 | save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000)); |
michael@0 | 6313 | save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000)); |
michael@0 | 6314 | |
michael@0 | 6315 | dst += 8; |
michael@0 | 6316 | src += 8; |
michael@0 | 6317 | w -= 8; |
michael@0 | 6318 | } |
michael@0 | 6319 | |
michael@0 | 6320 | while (w) |
michael@0 | 6321 | { |
michael@0 | 6322 | uint16_t s = *src++; |
michael@0 | 6323 | |
michael@0 | 6324 | *dst++ = convert_0565_to_8888 (s); |
michael@0 | 6325 | w--; |
michael@0 | 6326 | } |
michael@0 | 6327 | |
michael@0 | 6328 | return iter->buffer; |
michael@0 | 6329 | } |
michael@0 | 6330 | |
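/* convert_0565_to_8888 (pixman-private.h) widens r5g6b5 by replicating the
 * high bits of each field into the freed-up low bits, so 0x1f expands to 0xff
 * and 0x00 stays 0x00, and it sets the alpha byte to 0xff (the SSE2 body above
 * ORs in ff000000 for the same reason).  Equivalent scalar sketch (the helper
 * name is an illustrative assumption; disabled):
 */
#if 0
static uint32_t
expand_0565_scalar (uint16_t s)
{
    uint32_t r = (s >> 11) & 0x1f;
    uint32_t g = (s >> 5) & 0x3f;
    uint32_t b = s & 0x1f;

    r = (r << 3) | (r >> 2);    /* 5 -> 8 bits */
    g = (g << 2) | (g >> 4);    /* 6 -> 8 bits */
    b = (b << 3) | (b >> 2);    /* 5 -> 8 bits */

    return 0xff000000 | (r << 16) | (g << 8) | b;
}
#endif
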
michael@0 | 6331 | static uint32_t * |
michael@0 | 6332 | sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) |
michael@0 | 6333 | { |
michael@0 | 6334 | int w = iter->width; |
michael@0 | 6335 | uint32_t *dst = iter->buffer; |
michael@0 | 6336 | uint8_t *src = iter->bits; |
michael@0 | 6337 | __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6; |
michael@0 | 6338 | |
michael@0 | 6339 | iter->bits += iter->stride; |
michael@0 | 6340 | |
michael@0 | 6341 | while (w && (((uintptr_t)dst) & 15)) |
michael@0 | 6342 | { |
michael@0 | 6343 | *dst++ = *(src++) << 24; |
michael@0 | 6344 | w--; |
michael@0 | 6345 | } |
michael@0 | 6346 | |
michael@0 | 6347 | while (w >= 16) |
michael@0 | 6348 | { |
michael@0 | 6349 | xmm0 = _mm_loadu_si128((__m128i *)src); |
michael@0 | 6350 | |
michael@0 | 6351 | xmm1 = _mm_unpacklo_epi8 (_mm_setzero_si128(), xmm0); |
michael@0 | 6352 | xmm2 = _mm_unpackhi_epi8 (_mm_setzero_si128(), xmm0); |
michael@0 | 6353 | xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1); |
michael@0 | 6354 | xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1); |
michael@0 | 6355 | xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2); |
michael@0 | 6356 | xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2); |
michael@0 | 6357 | |
michael@0 | 6358 | _mm_store_si128(((__m128i *)(dst + 0)), xmm3); |
michael@0 | 6359 | _mm_store_si128(((__m128i *)(dst + 4)), xmm4); |
michael@0 | 6360 | _mm_store_si128(((__m128i *)(dst + 8)), xmm5); |
michael@0 | 6361 | _mm_store_si128(((__m128i *)(dst + 12)), xmm6); |
michael@0 | 6362 | |
michael@0 | 6363 | dst += 16; |
michael@0 | 6364 | src += 16; |
michael@0 | 6365 | w -= 16; |
michael@0 | 6366 | } |
michael@0 | 6367 | |
michael@0 | 6368 | while (w) |
michael@0 | 6369 | { |
michael@0 | 6370 | *dst++ = *(src++) << 24; |
michael@0 | 6371 | w--; |
michael@0 | 6372 | } |
michael@0 | 6373 | |
michael@0 | 6374 | return iter->buffer; |
michael@0 | 6375 | } |
michael@0 | 6376 | |
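/* The unpack cascade in sse2_fetch_a8 above interleaves zero bytes below each
 * alpha byte twice: after the epi8 stage every a8 value sits in the high byte
 * of a 16-bit lane (a << 8), and after the epi16 stage in the high byte of a
 * 32-bit lane (a << 24), the vector equivalent of the scalar
 * (uint32_t)a << 24 in the head and tail loops. */
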
michael@0 | 6377 | typedef struct |
michael@0 | 6378 | { |
michael@0 | 6379 | pixman_format_code_t format; |
michael@0 | 6380 | pixman_iter_get_scanline_t get_scanline; |
michael@0 | 6381 | } fetcher_info_t; |
michael@0 | 6382 | |
michael@0 | 6383 | static const fetcher_info_t fetchers[] = |
michael@0 | 6384 | { |
michael@0 | 6385 | { PIXMAN_x8r8g8b8, sse2_fetch_x8r8g8b8 }, |
michael@0 | 6386 | { PIXMAN_r5g6b5, sse2_fetch_r5g6b5 }, |
michael@0 | 6387 | { PIXMAN_a8, sse2_fetch_a8 }, |
michael@0 | 6388 | { PIXMAN_null } |
michael@0 | 6389 | }; |
michael@0 | 6390 | |
michael@0 | 6391 | static pixman_bool_t |
michael@0 | 6392 | sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) |
michael@0 | 6393 | { |
michael@0 | 6394 | pixman_image_t *image = iter->image; |
michael@0 | 6395 | |
michael@0 | 6396 | #define FLAGS \ |
michael@0 | 6397 | (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \ |
michael@0 | 6398 | FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) |
michael@0 | 6399 | |
michael@0 | 6400 | if ((iter->iter_flags & ITER_NARROW) && |
michael@0 | 6401 | (iter->image_flags & FLAGS) == FLAGS) |
michael@0 | 6402 | { |
michael@0 | 6403 | const fetcher_info_t *f; |
michael@0 | 6404 | |
michael@0 | 6405 | for (f = &fetchers[0]; f->format != PIXMAN_null; f++) |
michael@0 | 6406 | { |
michael@0 | 6407 | if (image->common.extended_format_code == f->format) |
michael@0 | 6408 | { |
michael@0 | 6409 | uint8_t *b = (uint8_t *)image->bits.bits; |
michael@0 | 6410 | int s = image->bits.rowstride * 4; /* rowstride is in uint32_t units, s is in bytes */ |
michael@0 | 6411 | |
michael@0 | 6412 | iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8; /* first requested pixel */ |
michael@0 | 6413 | iter->stride = s; |
michael@0 | 6414 | |
michael@0 | 6415 | iter->get_scanline = f->get_scanline; |
michael@0 | 6416 | return TRUE; |
michael@0 | 6417 | } |
michael@0 | 6418 | } |
michael@0 | 6419 | } |
michael@0 | 6420 | |
michael@0 | 6421 | return FALSE; |
michael@0 | 6422 | } |
michael@0 | 6423 | |
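/* On 32-bit x86 the ABI guarantees only 4-byte stack alignment at function
 * entry, while spilled __m128i values need 16 bytes; the attribute below makes
 * GCC realign the stack frame so this entry point is safe to call from
 * arbitrarily aligned callers. */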
michael@0 | 6424 | #if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__) |
michael@0 | 6425 | __attribute__((__force_align_arg_pointer__)) |
michael@0 | 6426 | #endif |
michael@0 | 6427 | pixman_implementation_t * |
michael@0 | 6428 | _pixman_implementation_create_sse2 (pixman_implementation_t *fallback) |
michael@0 | 6429 | { |
michael@0 | 6430 | pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths); |
michael@0 | 6431 | |
michael@0 | 6432 | /* SSE2 constants */ |
michael@0 | 6433 | mask_565_r = create_mask_2x32_128 (0x00f80000, 0x00f80000); |
michael@0 | 6434 | mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000); |
michael@0 | 6435 | mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0); |
michael@0 | 6436 | mask_565_b = create_mask_2x32_128 (0x0000001f, 0x0000001f); |
michael@0 | 6437 | mask_red = create_mask_2x32_128 (0x00f80000, 0x00f80000); |
michael@0 | 6438 | mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00); |
michael@0 | 6439 | mask_blue = create_mask_2x32_128 (0x000000f8, 0x000000f8); |
michael@0 | 6440 | mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0); |
michael@0 | 6441 | mask_565_fix_g = create_mask_2x32_128 (0x0000c000, 0x0000c000); |
michael@0 | 6442 | mask_0080 = create_mask_16_128 (0x0080); |
michael@0 | 6443 | mask_00ff = create_mask_16_128 (0x00ff); |
michael@0 | 6444 | mask_0101 = create_mask_16_128 (0x0101); |
michael@0 | 6445 | mask_ffff = create_mask_16_128 (0xffff); |
michael@0 | 6446 | mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000); |
michael@0 | 6447 | mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000); |
michael@0 | 6448 | mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8); |
michael@0 | 6449 | mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004); |
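
    /* create_mask_16_128 and create_mask_2x32_128 (defined earlier in this
     * file) broadcast their arguments across the 128-bit register --
     * effectively _mm_set1_epi16 (m) and _mm_set_epi32 (m0, m1, m0, m1) --
     * so each constant above applies the same per-lane mask to every pixel
     * held in a vector. */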
michael@0 | 6450 | |
michael@0 | 6451 | /* Set up function pointers */ |
michael@0 | 6452 | imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u; |
michael@0 | 6453 | imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u; |
michael@0 | 6454 | imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u; |
michael@0 | 6455 | imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u; |
michael@0 | 6456 | imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u; |
michael@0 | 6457 | imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u; |
michael@0 | 6458 | imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u; |
michael@0 | 6459 | imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u; |
michael@0 | 6460 | imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u; |
michael@0 | 6461 | imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u; |
michael@0 | 6462 | |
michael@0 | 6463 | imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u; |
michael@0 | 6464 | |
michael@0 | 6465 | imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca; |
michael@0 | 6466 | imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca; |
michael@0 | 6467 | imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca; |
michael@0 | 6468 | imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca; |
michael@0 | 6469 | imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca; |
michael@0 | 6470 | imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca; |
michael@0 | 6471 | imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca; |
michael@0 | 6472 | imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca; |
michael@0 | 6473 | imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca; |
michael@0 | 6474 | imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca; |
michael@0 | 6475 | imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca; |
michael@0 | 6476 | |
michael@0 | 6477 | imp->blt = sse2_blt; |
michael@0 | 6478 | imp->fill = sse2_fill; |
michael@0 | 6479 | |
michael@0 | 6480 | imp->src_iter_init = sse2_src_iter_init; |
michael@0 | 6481 | |
michael@0 | 6482 | return imp; |
michael@0 | 6483 | } |