|
1 /* |
|
2 * Copyright © 2004, 2005 Red Hat, Inc. |
|
3 * Copyright © 2004 Nicholas Miell |
|
4 * Copyright © 2005 Trolltech AS |
|
5 * |
|
6 * Permission to use, copy, modify, distribute, and sell this software and its |
|
7 * documentation for any purpose is hereby granted without fee, provided that |
|
8 * the above copyright notice appear in all copies and that both that |
|
9 * copyright notice and this permission notice appear in supporting |
|
10 * documentation, and that the name of Red Hat not be used in advertising or |
|
11 * publicity pertaining to distribution of the software without specific, |
|
12 * written prior permission. Red Hat makes no representations about the |
|
13 * suitability of this software for any purpose. It is provided "as is" |
|
14 * without express or implied warranty. |
|
15 * |
|
16 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS |
|
17 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
|
18 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY |
|
19 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
|
20 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN |
|
21 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING |
|
22 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS |
|
23 * SOFTWARE. |
|
24 * |
|
25 * Author: Søren Sandmann (sandmann@redhat.com) |
|
26 * Minor Improvements: Nicholas Miell (nmiell@gmail.com) |
|
27 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com) |
|
28 * |
|
29 * Based on work by Owen Taylor |
|
30 */ |
|
31 |
|
32 #ifdef HAVE_CONFIG_H |
|
33 #include <config.h> |
|
34 #endif |
|
35 |
|
36 #if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI |
|
37 |
|
38 #ifdef USE_LOONGSON_MMI |
|
39 #include <loongson-mmintrin.h> |
|
40 #else |
|
41 #include <mmintrin.h> |
|
42 #endif |
|
43 #include "pixman-private.h" |
|
44 #include "pixman-combine32.h" |
|
45 #include "pixman-inlines.h" |
|
46 |
|
47 #define no_vERBOSE |
|
48 |
|
49 #ifdef VERBOSE |
|
50 #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__) |
|
51 #else |
|
52 #define CHECKPOINT() |
|
53 #endif |
|
54 |
|
55 #if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8 |
|
56 /* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this. */ |
|
57 extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
|
58 _mm_empty (void) |
|
59 { |
|
60 |
|
61 } |
|
62 #endif |
|
63 |
|
64 #ifdef USE_X86_MMX |
|
65 # if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64)) |
|
66 # include <xmmintrin.h> |
|
67 # else |
|
68 /* We have to compile with -msse to use xmmintrin.h, but that causes SSE |
|
69 * instructions to be generated that we don't want. Just duplicate the |
|
70 * functions we want to use. */ |
|
71 extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
|
72 _mm_movemask_pi8 (__m64 __A) |
|
73 { |
|
74 int ret; |
|
75 |
|
76 asm ("pmovmskb %1, %0\n\t" |
|
77 : "=r" (ret) |
|
78 : "y" (__A) |
|
79 ); |
|
80 |
|
81 return ret; |
|
82 } |
|
83 |
|
84 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
|
85 _mm_mulhi_pu16 (__m64 __A, __m64 __B) |
|
86 { |
|
87 asm ("pmulhuw %1, %0\n\t" |
|
88 : "+y" (__A) |
|
89 : "y" (__B) |
|
90 ); |
|
91 return __A; |
|
92 } |
|
93 |
|
94 # ifdef __OPTIMIZE__ |
|
95 extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__)) |
|
96 _mm_shuffle_pi16 (__m64 __A, int8_t const __N) |
|
97 { |
|
98 __m64 ret; |
|
99 |
|
100 asm ("pshufw %2, %1, %0\n\t" |
|
101 : "=y" (ret) |
|
102 : "y" (__A), "K" (__N) |
|
103 ); |
|
104 |
|
105 return ret; |
|
106 } |
|
107 # else |
|
108 # define _mm_shuffle_pi16(A, N) \ |
|
109 ({ \ |
|
110 __m64 ret; \ |
|
111 \ |
|
112 asm ("pshufw %2, %1, %0\n\t" \ |
|
113 : "=y" (ret) \ |
|
114 : "y" (A), "K" ((const int8_t)N) \ |
|
115 ); \ |
|
116 \ |
|
117 ret; \ |
|
118 }) |
|
119 # endif |
|
120 # endif |
|
121 #endif |
|
122 |
|
123 #ifndef _MSC_VER |
|
124 #define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \ |
|
125 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0)) |
|
126 #endif |
|
127 |
|
128 /* Notes about writing mmx code |
|
129 * |
|
130 * give memory operands as the second operand. If you give it as the |
|
131 * first, gcc will first load it into a register, then use that |
|
132 * register |
|
133 * |
|
134 * ie. use |
|
135 * |
|
136 * _mm_mullo_pi16 (x, mmx_constant); |
|
137 * |
|
138 * not |
|
139 * |
|
140 * _mm_mullo_pi16 (mmx_constant, x); |
|
141 * |
|
142 * Also try to minimize dependencies. i.e. when you need a value, try |
|
143 * to calculate it from a value that was calculated as early as |
|
144 * possible. |
|
145 */ |
|
146 |
|
147 /* --------------- MMX primitives ------------------------------------- */ |
|
148 |
|
149 /* If __m64 is defined as a struct or union, then define M64_MEMBER to be |
|
150 * the name of the member used to access the data. |
|
151 * If __m64 requires using mm_cvt* intrinsics functions to convert between |
|
152 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS. |
|
153 * If __m64 and uint64_t values can just be cast to each other directly, |
|
154 * then define USE_M64_CASTS. |
|
155 * If __m64 is a double datatype, then define USE_M64_DOUBLE. |
|
156 */ |
|
157 #ifdef _MSC_VER |
|
158 # define M64_MEMBER m64_u64 |
|
159 #elif defined(__ICC) |
|
160 # define USE_CVT_INTRINSICS |
|
161 #elif defined(USE_LOONGSON_MMI) |
|
162 # define USE_M64_DOUBLE |
|
163 #elif defined(__GNUC__) |
|
164 # define USE_M64_CASTS |
|
165 #elif defined(__SUNPRO_C) |
|
166 # if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__) |
|
167 /* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__) |
|
168 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__ |
|
169 * is defined. If it is used, then the mm_cvt* intrinsics must be used. |
|
170 */ |
|
171 # define USE_CVT_INTRINSICS |
|
172 # else |
|
173 /* For Studio 12.2 or older, or when __attribute__(__vector_size__) is |
|
174 * disabled, __m64 is defined as a struct containing "unsigned long long l_". |
|
175 */ |
|
176 # define M64_MEMBER l_ |
|
177 # endif |
|
178 #endif |
|
179 |
|
180 #if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE) |
|
181 typedef uint64_t mmxdatafield; |
|
182 #else |
|
183 typedef __m64 mmxdatafield; |
|
184 #endif |
|
185 |
|
186 typedef struct |
|
187 { |
|
188 mmxdatafield mmx_4x00ff; |
|
189 mmxdatafield mmx_4x0080; |
|
190 mmxdatafield mmx_565_rgb; |
|
191 mmxdatafield mmx_565_unpack_multiplier; |
|
192 mmxdatafield mmx_565_pack_multiplier; |
|
193 mmxdatafield mmx_565_r; |
|
194 mmxdatafield mmx_565_g; |
|
195 mmxdatafield mmx_565_b; |
|
196 mmxdatafield mmx_packed_565_rb; |
|
197 mmxdatafield mmx_packed_565_g; |
|
198 mmxdatafield mmx_expand_565_g; |
|
199 mmxdatafield mmx_expand_565_b; |
|
200 mmxdatafield mmx_expand_565_r; |
|
201 #ifndef USE_LOONGSON_MMI |
|
202 mmxdatafield mmx_mask_0; |
|
203 mmxdatafield mmx_mask_1; |
|
204 mmxdatafield mmx_mask_2; |
|
205 mmxdatafield mmx_mask_3; |
|
206 #endif |
|
207 mmxdatafield mmx_full_alpha; |
|
208 mmxdatafield mmx_4x0101; |
|
209 mmxdatafield mmx_ff000000; |
|
210 } mmx_data_t; |
|
211 |
|
212 #if defined(_MSC_VER) |
|
213 # define MMXDATA_INIT(field, val) { val ## UI64 } |
|
214 #elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */ |
|
215 # define MMXDATA_INIT(field, val) field = { val ## ULL } |
|
216 #else /* mmxdatafield is an integral type */ |
|
217 # define MMXDATA_INIT(field, val) field = val ## ULL |
|
218 #endif |
|
219 |
|
220 static const mmx_data_t c = |
|
221 { |
|
222 MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff), |
|
223 MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080), |
|
224 MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f), |
|
225 MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840), |
|
226 MMXDATA_INIT (.mmx_565_pack_multiplier, 0x2000000420000004), |
|
227 MMXDATA_INIT (.mmx_565_r, 0x000000f800000000), |
|
228 MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000), |
|
229 MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8), |
|
230 MMXDATA_INIT (.mmx_packed_565_rb, 0x00f800f800f800f8), |
|
231 MMXDATA_INIT (.mmx_packed_565_g, 0x0000fc000000fc00), |
|
232 MMXDATA_INIT (.mmx_expand_565_g, 0x07e007e007e007e0), |
|
233 MMXDATA_INIT (.mmx_expand_565_b, 0x001f001f001f001f), |
|
234 MMXDATA_INIT (.mmx_expand_565_r, 0xf800f800f800f800), |
|
235 #ifndef USE_LOONGSON_MMI |
|
236 MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000), |
|
237 MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff), |
|
238 MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff), |
|
239 MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff), |
|
240 #endif |
|
241 MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000), |
|
242 MMXDATA_INIT (.mmx_4x0101, 0x0101010101010101), |
|
243 MMXDATA_INIT (.mmx_ff000000, 0xff000000ff000000), |
|
244 }; |
|
245 |
|
246 #ifdef USE_CVT_INTRINSICS |
|
247 # define MC(x) to_m64 (c.mmx_ ## x) |
|
248 #elif defined(USE_M64_CASTS) |
|
249 # define MC(x) ((__m64)c.mmx_ ## x) |
|
250 #elif defined(USE_M64_DOUBLE) |
|
251 # define MC(x) (*(__m64 *)&c.mmx_ ## x) |
|
252 #else |
|
253 # define MC(x) c.mmx_ ## x |
|
254 #endif |
|
255 |
|
256 static force_inline __m64 |
|
257 to_m64 (uint64_t x) |
|
258 { |
|
259 #ifdef USE_CVT_INTRINSICS |
|
260 return _mm_cvtsi64_m64 (x); |
|
261 #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ |
|
262 __m64 res; |
|
263 |
|
264 res.M64_MEMBER = x; |
|
265 return res; |
|
266 #elif defined USE_M64_DOUBLE |
|
267 return *(__m64 *)&x; |
|
268 #else /* USE_M64_CASTS */ |
|
269 return (__m64)x; |
|
270 #endif |
|
271 } |
|
272 |
|
273 static force_inline uint64_t |
|
274 to_uint64 (__m64 x) |
|
275 { |
|
276 #ifdef USE_CVT_INTRINSICS |
|
277 return _mm_cvtm64_si64 (x); |
|
278 #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ |
|
279 uint64_t res = x.M64_MEMBER; |
|
280 return res; |
|
281 #elif defined USE_M64_DOUBLE |
|
282 return *(uint64_t *)&x; |
|
283 #else /* USE_M64_CASTS */ |
|
284 return (uint64_t)x; |
|
285 #endif |
|
286 } |
|
287 |
|
288 static force_inline __m64 |
|
289 shift (__m64 v, |
|
290 int s) |
|
291 { |
|
292 if (s > 0) |
|
293 return _mm_slli_si64 (v, s); |
|
294 else if (s < 0) |
|
295 return _mm_srli_si64 (v, -s); |
|
296 else |
|
297 return v; |
|
298 } |
|
299 |
|
300 static force_inline __m64 |
|
301 negate (__m64 mask) |
|
302 { |
|
303 return _mm_xor_si64 (mask, MC (4x00ff)); |
|
304 } |
|
305 |
|
306 static force_inline __m64 |
|
307 pix_multiply (__m64 a, __m64 b) |
|
308 { |
|
309 __m64 res; |
|
310 |
|
311 res = _mm_mullo_pi16 (a, b); |
|
312 res = _mm_adds_pu16 (res, MC (4x0080)); |
|
313 res = _mm_mulhi_pu16 (res, MC (4x0101)); |
|
314 |
|
315 return res; |
|
316 } |
|
317 |
|
318 static force_inline __m64 |
|
319 pix_add (__m64 a, __m64 b) |
|
320 { |
|
321 return _mm_adds_pu8 (a, b); |
|
322 } |
|
323 |
|
324 static force_inline __m64 |
|
325 expand_alpha (__m64 pixel) |
|
326 { |
|
327 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3)); |
|
328 } |
|
329 |
|
330 static force_inline __m64 |
|
331 expand_alpha_rev (__m64 pixel) |
|
332 { |
|
333 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0)); |
|
334 } |
|
335 |
|
336 static force_inline __m64 |
|
337 invert_colors (__m64 pixel) |
|
338 { |
|
339 return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2)); |
|
340 } |
|
341 |
|
342 static force_inline __m64 |
|
343 over (__m64 src, |
|
344 __m64 srca, |
|
345 __m64 dest) |
|
346 { |
|
347 return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca))); |
|
348 } |
|
349 |
|
350 static force_inline __m64 |
|
351 over_rev_non_pre (__m64 src, __m64 dest) |
|
352 { |
|
353 __m64 srca = expand_alpha (src); |
|
354 __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha)); |
|
355 |
|
356 return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest); |
|
357 } |
|
358 |
|
359 static force_inline __m64 |
|
360 in (__m64 src, __m64 mask) |
|
361 { |
|
362 return pix_multiply (src, mask); |
|
363 } |
|
364 |
|
365 #ifndef _MSC_VER |
|
366 static force_inline __m64 |
|
367 in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest) |
|
368 { |
|
369 return over (in (src, mask), pix_multiply (srca, mask), dest); |
|
370 } |
|
371 |
|
372 #else |
|
373 |
|
374 #define in_over(src, srca, mask, dest) \ |
|
375 over (in (src, mask), pix_multiply (srca, mask), dest) |
|
376 |
|
377 #endif |
|
378 |
|
379 /* Elemental unaligned loads */ |
|
380 |
|
381 static force_inline __m64 ldq_u(__m64 *p) |
|
382 { |
|
383 #ifdef USE_X86_MMX |
|
384 /* x86's alignment restrictions are very relaxed. */ |
|
385 return *(__m64 *)p; |
|
386 #elif defined USE_ARM_IWMMXT |
|
387 int align = (uintptr_t)p & 7; |
|
388 __m64 *aligned_p; |
|
389 if (align == 0) |
|
390 return *p; |
|
391 aligned_p = (__m64 *)((uintptr_t)p & ~7); |
|
392 return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align); |
|
393 #else |
|
394 struct __una_u64 { __m64 x __attribute__((packed)); }; |
|
395 const struct __una_u64 *ptr = (const struct __una_u64 *) p; |
|
396 return (__m64) ptr->x; |
|
397 #endif |
|
398 } |
|
399 |
|
400 static force_inline uint32_t ldl_u(const uint32_t *p) |
|
401 { |
|
402 #ifdef USE_X86_MMX |
|
403 /* x86's alignment restrictions are very relaxed. */ |
|
404 return *p; |
|
405 #else |
|
406 struct __una_u32 { uint32_t x __attribute__((packed)); }; |
|
407 const struct __una_u32 *ptr = (const struct __una_u32 *) p; |
|
408 return ptr->x; |
|
409 #endif |
|
410 } |
|
411 |
|
412 static force_inline __m64 |
|
413 load (const uint32_t *v) |
|
414 { |
|
415 #ifdef USE_LOONGSON_MMI |
|
416 __m64 ret; |
|
417 asm ("lwc1 %0, %1\n\t" |
|
418 : "=f" (ret) |
|
419 : "m" (*v) |
|
420 ); |
|
421 return ret; |
|
422 #else |
|
423 return _mm_cvtsi32_si64 (*v); |
|
424 #endif |
|
425 } |
|
426 |
|
427 static force_inline __m64 |
|
428 load8888 (const uint32_t *v) |
|
429 { |
|
430 #ifdef USE_LOONGSON_MMI |
|
431 return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ()); |
|
432 #else |
|
433 return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ()); |
|
434 #endif |
|
435 } |
|
436 |
|
437 static force_inline __m64 |
|
438 load8888u (const uint32_t *v) |
|
439 { |
|
440 uint32_t l = ldl_u (v); |
|
441 return load8888 (&l); |
|
442 } |
|
443 |
|
444 static force_inline __m64 |
|
445 pack8888 (__m64 lo, __m64 hi) |
|
446 { |
|
447 return _mm_packs_pu16 (lo, hi); |
|
448 } |
|
449 |
|
450 static force_inline void |
|
451 store (uint32_t *dest, __m64 v) |
|
452 { |
|
453 #ifdef USE_LOONGSON_MMI |
|
454 asm ("swc1 %1, %0\n\t" |
|
455 : "=m" (*dest) |
|
456 : "f" (v) |
|
457 : "memory" |
|
458 ); |
|
459 #else |
|
460 *dest = _mm_cvtsi64_si32 (v); |
|
461 #endif |
|
462 } |
|
463 |
|
464 static force_inline void |
|
465 store8888 (uint32_t *dest, __m64 v) |
|
466 { |
|
467 v = pack8888 (v, _mm_setzero_si64 ()); |
|
468 store (dest, v); |
|
469 } |
|
470 |
|
471 static force_inline pixman_bool_t |
|
472 is_equal (__m64 a, __m64 b) |
|
473 { |
|
474 #ifdef USE_LOONGSON_MMI |
|
475 /* __m64 is double, we can compare directly. */ |
|
476 return a == b; |
|
477 #else |
|
478 return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff; |
|
479 #endif |
|
480 } |
|
481 |
|
482 static force_inline pixman_bool_t |
|
483 is_opaque (__m64 v) |
|
484 { |
|
485 #ifdef USE_LOONGSON_MMI |
|
486 return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha)); |
|
487 #else |
|
488 __m64 ffs = _mm_cmpeq_pi8 (v, v); |
|
489 return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40); |
|
490 #endif |
|
491 } |
|
492 |
|
493 static force_inline pixman_bool_t |
|
494 is_zero (__m64 v) |
|
495 { |
|
496 return is_equal (v, _mm_setzero_si64 ()); |
|
497 } |
|
498 |
|
499 /* Expand 16 bits positioned at @pos (0-3) of a mmx register into |
|
500 * |
|
501 * 00RR00GG00BB |
|
502 * |
|
503 * --- Expanding 565 in the low word --- |
|
504 * |
|
505 * m = (m << (32 - 3)) | (m << (16 - 5)) | m; |
|
506 * m = m & (01f0003f001f); |
|
507 * m = m * (008404100840); |
|
508 * m = m >> 8; |
|
509 * |
|
510 * Note the trick here - the top word is shifted by another nibble to |
|
511 * avoid it bumping into the middle word |
|
512 */ |
|
513 static force_inline __m64 |
|
514 expand565 (__m64 pixel, int pos) |
|
515 { |
|
516 __m64 p = pixel; |
|
517 __m64 t1, t2; |
|
518 |
|
519 /* move pixel to low 16 bit and zero the rest */ |
|
520 #ifdef USE_LOONGSON_MMI |
|
521 p = loongson_extract_pi16 (p, pos); |
|
522 #else |
|
523 p = shift (shift (p, (3 - pos) * 16), -48); |
|
524 #endif |
|
525 |
|
526 t1 = shift (p, 36 - 11); |
|
527 t2 = shift (p, 16 - 5); |
|
528 |
|
529 p = _mm_or_si64 (t1, p); |
|
530 p = _mm_or_si64 (t2, p); |
|
531 p = _mm_and_si64 (p, MC (565_rgb)); |
|
532 |
|
533 pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier)); |
|
534 return _mm_srli_pi16 (pixel, 8); |
|
535 } |
|
536 |
|
537 /* Expand 4 16 bit pixels in an mmx register into two mmx registers of |
|
538 * |
|
539 * AARRGGBBRRGGBB |
|
540 */ |
|
541 static force_inline void |
|
542 expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha) |
|
543 { |
|
544 __m64 t0, t1, alpha = _mm_setzero_si64 (); |
|
545 __m64 r = _mm_and_si64 (vin, MC (expand_565_r)); |
|
546 __m64 g = _mm_and_si64 (vin, MC (expand_565_g)); |
|
547 __m64 b = _mm_and_si64 (vin, MC (expand_565_b)); |
|
548 if (full_alpha) |
|
549 alpha = _mm_cmpeq_pi32 (alpha, alpha); |
|
550 |
|
551 /* Replicate high bits into empty low bits. */ |
|
552 r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13)); |
|
553 g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9)); |
|
554 b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2)); |
|
555 |
|
556 r = _mm_packs_pu16 (r, _mm_setzero_si64 ()); /* 00 00 00 00 R3 R2 R1 R0 */ |
|
557 g = _mm_packs_pu16 (g, _mm_setzero_si64 ()); /* 00 00 00 00 G3 G2 G1 G0 */ |
|
558 b = _mm_packs_pu16 (b, _mm_setzero_si64 ()); /* 00 00 00 00 B3 B2 B1 B0 */ |
|
559 |
|
560 t1 = _mm_unpacklo_pi8 (r, alpha); /* A3 R3 A2 R2 A1 R1 A0 R0 */ |
|
561 t0 = _mm_unpacklo_pi8 (b, g); /* G3 B3 G2 B2 G1 B1 G0 B0 */ |
|
562 |
|
563 *vout0 = _mm_unpacklo_pi16 (t0, t1); /* A1 R1 G1 B1 A0 R0 G0 B0 */ |
|
564 *vout1 = _mm_unpackhi_pi16 (t0, t1); /* A3 R3 G3 B3 A2 R2 G2 B2 */ |
|
565 } |
|
566 |
|
567 static force_inline __m64 |
|
568 expand8888 (__m64 in, int pos) |
|
569 { |
|
570 if (pos == 0) |
|
571 return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ()); |
|
572 else |
|
573 return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ()); |
|
574 } |
|
575 |
|
576 static force_inline __m64 |
|
577 expandx888 (__m64 in, int pos) |
|
578 { |
|
579 return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha)); |
|
580 } |
|
581 |
|
582 static force_inline void |
|
583 expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha) |
|
584 { |
|
585 __m64 v0, v1; |
|
586 expand_4xpacked565 (vin, &v0, &v1, full_alpha); |
|
587 *vout0 = expand8888 (v0, 0); |
|
588 *vout1 = expand8888 (v0, 1); |
|
589 *vout2 = expand8888 (v1, 0); |
|
590 *vout3 = expand8888 (v1, 1); |
|
591 } |
|
592 |
|
593 static force_inline __m64 |
|
594 pack_565 (__m64 pixel, __m64 target, int pos) |
|
595 { |
|
596 __m64 p = pixel; |
|
597 __m64 t = target; |
|
598 __m64 r, g, b; |
|
599 |
|
600 r = _mm_and_si64 (p, MC (565_r)); |
|
601 g = _mm_and_si64 (p, MC (565_g)); |
|
602 b = _mm_and_si64 (p, MC (565_b)); |
|
603 |
|
604 #ifdef USE_LOONGSON_MMI |
|
605 r = shift (r, -(32 - 8)); |
|
606 g = shift (g, -(16 - 3)); |
|
607 b = shift (b, -(0 + 3)); |
|
608 |
|
609 p = _mm_or_si64 (r, g); |
|
610 p = _mm_or_si64 (p, b); |
|
611 return loongson_insert_pi16 (t, p, pos); |
|
612 #else |
|
613 r = shift (r, -(32 - 8) + pos * 16); |
|
614 g = shift (g, -(16 - 3) + pos * 16); |
|
615 b = shift (b, -(0 + 3) + pos * 16); |
|
616 |
|
617 if (pos == 0) |
|
618 t = _mm_and_si64 (t, MC (mask_0)); |
|
619 else if (pos == 1) |
|
620 t = _mm_and_si64 (t, MC (mask_1)); |
|
621 else if (pos == 2) |
|
622 t = _mm_and_si64 (t, MC (mask_2)); |
|
623 else if (pos == 3) |
|
624 t = _mm_and_si64 (t, MC (mask_3)); |
|
625 |
|
626 p = _mm_or_si64 (r, t); |
|
627 p = _mm_or_si64 (g, p); |
|
628 |
|
629 return _mm_or_si64 (b, p); |
|
630 #endif |
|
631 } |
|
632 |
|
633 static force_inline __m64 |
|
634 pack_4xpacked565 (__m64 a, __m64 b) |
|
635 { |
|
636 __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb)); |
|
637 __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb)); |
|
638 |
|
639 __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier)); |
|
640 __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier)); |
|
641 |
|
642 __m64 g0 = _mm_and_si64 (a, MC (packed_565_g)); |
|
643 __m64 g1 = _mm_and_si64 (b, MC (packed_565_g)); |
|
644 |
|
645 t0 = _mm_or_si64 (t0, g0); |
|
646 t1 = _mm_or_si64 (t1, g1); |
|
647 |
|
648 t0 = shift(t0, -5); |
|
649 #ifdef USE_ARM_IWMMXT |
|
650 t1 = shift(t1, -5); |
|
651 return _mm_packs_pu32 (t0, t1); |
|
652 #else |
|
653 t1 = shift(t1, -5 + 16); |
|
654 return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0)); |
|
655 #endif |
|
656 } |
|
657 |
|
658 #ifndef _MSC_VER |
|
659 |
|
660 static force_inline __m64 |
|
661 pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3) |
|
662 { |
|
663 return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3)); |
|
664 } |
|
665 |
|
666 static force_inline __m64 |
|
667 pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b) |
|
668 { |
|
669 x = pix_multiply (x, a); |
|
670 y = pix_multiply (y, b); |
|
671 |
|
672 return pix_add (x, y); |
|
673 } |
|
674 |
|
675 #else |
|
676 |
|
677 /* MSVC only handles a "pass by register" of up to three SSE intrinsics */ |
|
678 |
|
679 #define pack_4x565(v0, v1, v2, v3) \ |
|
680 pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3)) |
|
681 |
|
682 #define pix_add_mul(x, a, y, b) \ |
|
683 ( x = pix_multiply (x, a), \ |
|
684 y = pix_multiply (y, b), \ |
|
685 pix_add (x, y) ) |
|
686 |
|
687 #endif |
|
688 |
|
689 /* --------------- MMX code patch for fbcompose.c --------------------- */ |
|
690 |
|
691 static force_inline __m64 |
|
692 combine (const uint32_t *src, const uint32_t *mask) |
|
693 { |
|
694 __m64 vsrc = load8888 (src); |
|
695 |
|
696 if (mask) |
|
697 { |
|
698 __m64 m = load8888 (mask); |
|
699 |
|
700 m = expand_alpha (m); |
|
701 vsrc = pix_multiply (vsrc, m); |
|
702 } |
|
703 |
|
704 return vsrc; |
|
705 } |
|
706 |
|
707 static force_inline __m64 |
|
708 core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst) |
|
709 { |
|
710 vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ()); |
|
711 |
|
712 if (is_opaque (vsrc)) |
|
713 { |
|
714 return vsrc; |
|
715 } |
|
716 else if (!is_zero (vsrc)) |
|
717 { |
|
718 return over (vsrc, expand_alpha (vsrc), |
|
719 _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ())); |
|
720 } |
|
721 |
|
722 return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()); |
|
723 } |
|
724 |
|
725 static void |
|
726 mmx_combine_over_u (pixman_implementation_t *imp, |
|
727 pixman_op_t op, |
|
728 uint32_t * dest, |
|
729 const uint32_t * src, |
|
730 const uint32_t * mask, |
|
731 int width) |
|
732 { |
|
733 const uint32_t *end = dest + width; |
|
734 |
|
735 while (dest < end) |
|
736 { |
|
737 __m64 vsrc = combine (src, mask); |
|
738 |
|
739 if (is_opaque (vsrc)) |
|
740 { |
|
741 store8888 (dest, vsrc); |
|
742 } |
|
743 else if (!is_zero (vsrc)) |
|
744 { |
|
745 __m64 sa = expand_alpha (vsrc); |
|
746 store8888 (dest, over (vsrc, sa, load8888 (dest))); |
|
747 } |
|
748 |
|
749 ++dest; |
|
750 ++src; |
|
751 if (mask) |
|
752 ++mask; |
|
753 } |
|
754 _mm_empty (); |
|
755 } |
|
756 |
|
757 static void |
|
758 mmx_combine_over_reverse_u (pixman_implementation_t *imp, |
|
759 pixman_op_t op, |
|
760 uint32_t * dest, |
|
761 const uint32_t * src, |
|
762 const uint32_t * mask, |
|
763 int width) |
|
764 { |
|
765 const uint32_t *end = dest + width; |
|
766 |
|
767 while (dest < end) |
|
768 { |
|
769 __m64 d, da; |
|
770 __m64 s = combine (src, mask); |
|
771 |
|
772 d = load8888 (dest); |
|
773 da = expand_alpha (d); |
|
774 store8888 (dest, over (d, da, s)); |
|
775 |
|
776 ++dest; |
|
777 ++src; |
|
778 if (mask) |
|
779 mask++; |
|
780 } |
|
781 _mm_empty (); |
|
782 } |
|
783 |
|
784 static void |
|
785 mmx_combine_in_u (pixman_implementation_t *imp, |
|
786 pixman_op_t op, |
|
787 uint32_t * dest, |
|
788 const uint32_t * src, |
|
789 const uint32_t * mask, |
|
790 int width) |
|
791 { |
|
792 const uint32_t *end = dest + width; |
|
793 |
|
794 while (dest < end) |
|
795 { |
|
796 __m64 a; |
|
797 __m64 x = combine (src, mask); |
|
798 |
|
799 a = load8888 (dest); |
|
800 a = expand_alpha (a); |
|
801 x = pix_multiply (x, a); |
|
802 |
|
803 store8888 (dest, x); |
|
804 |
|
805 ++dest; |
|
806 ++src; |
|
807 if (mask) |
|
808 mask++; |
|
809 } |
|
810 _mm_empty (); |
|
811 } |
|
812 |
|
813 static void |
|
814 mmx_combine_in_reverse_u (pixman_implementation_t *imp, |
|
815 pixman_op_t op, |
|
816 uint32_t * dest, |
|
817 const uint32_t * src, |
|
818 const uint32_t * mask, |
|
819 int width) |
|
820 { |
|
821 const uint32_t *end = dest + width; |
|
822 |
|
823 while (dest < end) |
|
824 { |
|
825 __m64 a = combine (src, mask); |
|
826 __m64 x; |
|
827 |
|
828 x = load8888 (dest); |
|
829 a = expand_alpha (a); |
|
830 x = pix_multiply (x, a); |
|
831 store8888 (dest, x); |
|
832 |
|
833 ++dest; |
|
834 ++src; |
|
835 if (mask) |
|
836 mask++; |
|
837 } |
|
838 _mm_empty (); |
|
839 } |
|
840 |
|
841 static void |
|
842 mmx_combine_out_u (pixman_implementation_t *imp, |
|
843 pixman_op_t op, |
|
844 uint32_t * dest, |
|
845 const uint32_t * src, |
|
846 const uint32_t * mask, |
|
847 int width) |
|
848 { |
|
849 const uint32_t *end = dest + width; |
|
850 |
|
851 while (dest < end) |
|
852 { |
|
853 __m64 a; |
|
854 __m64 x = combine (src, mask); |
|
855 |
|
856 a = load8888 (dest); |
|
857 a = expand_alpha (a); |
|
858 a = negate (a); |
|
859 x = pix_multiply (x, a); |
|
860 store8888 (dest, x); |
|
861 |
|
862 ++dest; |
|
863 ++src; |
|
864 if (mask) |
|
865 mask++; |
|
866 } |
|
867 _mm_empty (); |
|
868 } |
|
869 |
|
870 static void |
|
871 mmx_combine_out_reverse_u (pixman_implementation_t *imp, |
|
872 pixman_op_t op, |
|
873 uint32_t * dest, |
|
874 const uint32_t * src, |
|
875 const uint32_t * mask, |
|
876 int width) |
|
877 { |
|
878 const uint32_t *end = dest + width; |
|
879 |
|
880 while (dest < end) |
|
881 { |
|
882 __m64 a = combine (src, mask); |
|
883 __m64 x; |
|
884 |
|
885 x = load8888 (dest); |
|
886 a = expand_alpha (a); |
|
887 a = negate (a); |
|
888 x = pix_multiply (x, a); |
|
889 |
|
890 store8888 (dest, x); |
|
891 |
|
892 ++dest; |
|
893 ++src; |
|
894 if (mask) |
|
895 mask++; |
|
896 } |
|
897 _mm_empty (); |
|
898 } |
|
899 |
|
900 static void |
|
901 mmx_combine_atop_u (pixman_implementation_t *imp, |
|
902 pixman_op_t op, |
|
903 uint32_t * dest, |
|
904 const uint32_t * src, |
|
905 const uint32_t * mask, |
|
906 int width) |
|
907 { |
|
908 const uint32_t *end = dest + width; |
|
909 |
|
910 while (dest < end) |
|
911 { |
|
912 __m64 da, d, sia; |
|
913 __m64 s = combine (src, mask); |
|
914 |
|
915 d = load8888 (dest); |
|
916 sia = expand_alpha (s); |
|
917 sia = negate (sia); |
|
918 da = expand_alpha (d); |
|
919 s = pix_add_mul (s, da, d, sia); |
|
920 store8888 (dest, s); |
|
921 |
|
922 ++dest; |
|
923 ++src; |
|
924 if (mask) |
|
925 mask++; |
|
926 } |
|
927 _mm_empty (); |
|
928 } |
|
929 |
|
930 static void |
|
931 mmx_combine_atop_reverse_u (pixman_implementation_t *imp, |
|
932 pixman_op_t op, |
|
933 uint32_t * dest, |
|
934 const uint32_t * src, |
|
935 const uint32_t * mask, |
|
936 int width) |
|
937 { |
|
938 const uint32_t *end; |
|
939 |
|
940 end = dest + width; |
|
941 |
|
942 while (dest < end) |
|
943 { |
|
944 __m64 dia, d, sa; |
|
945 __m64 s = combine (src, mask); |
|
946 |
|
947 d = load8888 (dest); |
|
948 sa = expand_alpha (s); |
|
949 dia = expand_alpha (d); |
|
950 dia = negate (dia); |
|
951 s = pix_add_mul (s, dia, d, sa); |
|
952 store8888 (dest, s); |
|
953 |
|
954 ++dest; |
|
955 ++src; |
|
956 if (mask) |
|
957 mask++; |
|
958 } |
|
959 _mm_empty (); |
|
960 } |
|
961 |
|
962 static void |
|
963 mmx_combine_xor_u (pixman_implementation_t *imp, |
|
964 pixman_op_t op, |
|
965 uint32_t * dest, |
|
966 const uint32_t * src, |
|
967 const uint32_t * mask, |
|
968 int width) |
|
969 { |
|
970 const uint32_t *end = dest + width; |
|
971 |
|
972 while (dest < end) |
|
973 { |
|
974 __m64 dia, d, sia; |
|
975 __m64 s = combine (src, mask); |
|
976 |
|
977 d = load8888 (dest); |
|
978 sia = expand_alpha (s); |
|
979 dia = expand_alpha (d); |
|
980 sia = negate (sia); |
|
981 dia = negate (dia); |
|
982 s = pix_add_mul (s, dia, d, sia); |
|
983 store8888 (dest, s); |
|
984 |
|
985 ++dest; |
|
986 ++src; |
|
987 if (mask) |
|
988 mask++; |
|
989 } |
|
990 _mm_empty (); |
|
991 } |
|
992 |
|
993 static void |
|
994 mmx_combine_add_u (pixman_implementation_t *imp, |
|
995 pixman_op_t op, |
|
996 uint32_t * dest, |
|
997 const uint32_t * src, |
|
998 const uint32_t * mask, |
|
999 int width) |
|
1000 { |
|
1001 const uint32_t *end = dest + width; |
|
1002 |
|
1003 while (dest < end) |
|
1004 { |
|
1005 __m64 d; |
|
1006 __m64 s = combine (src, mask); |
|
1007 |
|
1008 d = load8888 (dest); |
|
1009 s = pix_add (s, d); |
|
1010 store8888 (dest, s); |
|
1011 |
|
1012 ++dest; |
|
1013 ++src; |
|
1014 if (mask) |
|
1015 mask++; |
|
1016 } |
|
1017 _mm_empty (); |
|
1018 } |
|
1019 |
|
1020 static void |
|
1021 mmx_combine_saturate_u (pixman_implementation_t *imp, |
|
1022 pixman_op_t op, |
|
1023 uint32_t * dest, |
|
1024 const uint32_t * src, |
|
1025 const uint32_t * mask, |
|
1026 int width) |
|
1027 { |
|
1028 const uint32_t *end = dest + width; |
|
1029 |
|
1030 while (dest < end) |
|
1031 { |
|
1032 uint32_t s, sa, da; |
|
1033 uint32_t d = *dest; |
|
1034 __m64 ms = combine (src, mask); |
|
1035 __m64 md = load8888 (dest); |
|
1036 |
|
1037 store8888(&s, ms); |
|
1038 da = ~d >> 24; |
|
1039 sa = s >> 24; |
|
1040 |
|
1041 if (sa > da) |
|
1042 { |
|
1043 uint32_t quot = DIV_UN8 (da, sa) << 24; |
|
1044 __m64 msa = load8888 ("); |
|
1045 msa = expand_alpha (msa); |
|
1046 ms = pix_multiply (ms, msa); |
|
1047 } |
|
1048 |
|
1049 md = pix_add (md, ms); |
|
1050 store8888 (dest, md); |
|
1051 |
|
1052 ++src; |
|
1053 ++dest; |
|
1054 if (mask) |
|
1055 mask++; |
|
1056 } |
|
1057 _mm_empty (); |
|
1058 } |
|
1059 |
|
1060 static void |
|
1061 mmx_combine_src_ca (pixman_implementation_t *imp, |
|
1062 pixman_op_t op, |
|
1063 uint32_t * dest, |
|
1064 const uint32_t * src, |
|
1065 const uint32_t * mask, |
|
1066 int width) |
|
1067 { |
|
1068 const uint32_t *end = src + width; |
|
1069 |
|
1070 while (src < end) |
|
1071 { |
|
1072 __m64 a = load8888 (mask); |
|
1073 __m64 s = load8888 (src); |
|
1074 |
|
1075 s = pix_multiply (s, a); |
|
1076 store8888 (dest, s); |
|
1077 |
|
1078 ++src; |
|
1079 ++mask; |
|
1080 ++dest; |
|
1081 } |
|
1082 _mm_empty (); |
|
1083 } |
|
1084 |
|
1085 static void |
|
1086 mmx_combine_over_ca (pixman_implementation_t *imp, |
|
1087 pixman_op_t op, |
|
1088 uint32_t * dest, |
|
1089 const uint32_t * src, |
|
1090 const uint32_t * mask, |
|
1091 int width) |
|
1092 { |
|
1093 const uint32_t *end = src + width; |
|
1094 |
|
1095 while (src < end) |
|
1096 { |
|
1097 __m64 a = load8888 (mask); |
|
1098 __m64 s = load8888 (src); |
|
1099 __m64 d = load8888 (dest); |
|
1100 __m64 sa = expand_alpha (s); |
|
1101 |
|
1102 store8888 (dest, in_over (s, sa, a, d)); |
|
1103 |
|
1104 ++src; |
|
1105 ++dest; |
|
1106 ++mask; |
|
1107 } |
|
1108 _mm_empty (); |
|
1109 } |
|
1110 |
|
1111 static void |
|
1112 mmx_combine_over_reverse_ca (pixman_implementation_t *imp, |
|
1113 pixman_op_t op, |
|
1114 uint32_t * dest, |
|
1115 const uint32_t * src, |
|
1116 const uint32_t * mask, |
|
1117 int width) |
|
1118 { |
|
1119 const uint32_t *end = src + width; |
|
1120 |
|
1121 while (src < end) |
|
1122 { |
|
1123 __m64 a = load8888 (mask); |
|
1124 __m64 s = load8888 (src); |
|
1125 __m64 d = load8888 (dest); |
|
1126 __m64 da = expand_alpha (d); |
|
1127 |
|
1128 store8888 (dest, over (d, da, in (s, a))); |
|
1129 |
|
1130 ++src; |
|
1131 ++dest; |
|
1132 ++mask; |
|
1133 } |
|
1134 _mm_empty (); |
|
1135 } |
|
1136 |
|
1137 static void |
|
1138 mmx_combine_in_ca (pixman_implementation_t *imp, |
|
1139 pixman_op_t op, |
|
1140 uint32_t * dest, |
|
1141 const uint32_t * src, |
|
1142 const uint32_t * mask, |
|
1143 int width) |
|
1144 { |
|
1145 const uint32_t *end = src + width; |
|
1146 |
|
1147 while (src < end) |
|
1148 { |
|
1149 __m64 a = load8888 (mask); |
|
1150 __m64 s = load8888 (src); |
|
1151 __m64 d = load8888 (dest); |
|
1152 __m64 da = expand_alpha (d); |
|
1153 |
|
1154 s = pix_multiply (s, a); |
|
1155 s = pix_multiply (s, da); |
|
1156 store8888 (dest, s); |
|
1157 |
|
1158 ++src; |
|
1159 ++dest; |
|
1160 ++mask; |
|
1161 } |
|
1162 _mm_empty (); |
|
1163 } |
|
1164 |
|
1165 static void |
|
1166 mmx_combine_in_reverse_ca (pixman_implementation_t *imp, |
|
1167 pixman_op_t op, |
|
1168 uint32_t * dest, |
|
1169 const uint32_t * src, |
|
1170 const uint32_t * mask, |
|
1171 int width) |
|
1172 { |
|
1173 const uint32_t *end = src + width; |
|
1174 |
|
1175 while (src < end) |
|
1176 { |
|
1177 __m64 a = load8888 (mask); |
|
1178 __m64 s = load8888 (src); |
|
1179 __m64 d = load8888 (dest); |
|
1180 __m64 sa = expand_alpha (s); |
|
1181 |
|
1182 a = pix_multiply (a, sa); |
|
1183 d = pix_multiply (d, a); |
|
1184 store8888 (dest, d); |
|
1185 |
|
1186 ++src; |
|
1187 ++dest; |
|
1188 ++mask; |
|
1189 } |
|
1190 _mm_empty (); |
|
1191 } |
|
1192 |
|
1193 static void |
|
1194 mmx_combine_out_ca (pixman_implementation_t *imp, |
|
1195 pixman_op_t op, |
|
1196 uint32_t * dest, |
|
1197 const uint32_t * src, |
|
1198 const uint32_t * mask, |
|
1199 int width) |
|
1200 { |
|
1201 const uint32_t *end = src + width; |
|
1202 |
|
1203 while (src < end) |
|
1204 { |
|
1205 __m64 a = load8888 (mask); |
|
1206 __m64 s = load8888 (src); |
|
1207 __m64 d = load8888 (dest); |
|
1208 __m64 da = expand_alpha (d); |
|
1209 |
|
1210 da = negate (da); |
|
1211 s = pix_multiply (s, a); |
|
1212 s = pix_multiply (s, da); |
|
1213 store8888 (dest, s); |
|
1214 |
|
1215 ++src; |
|
1216 ++dest; |
|
1217 ++mask; |
|
1218 } |
|
1219 _mm_empty (); |
|
1220 } |
|
1221 |
|
1222 static void |
|
1223 mmx_combine_out_reverse_ca (pixman_implementation_t *imp, |
|
1224 pixman_op_t op, |
|
1225 uint32_t * dest, |
|
1226 const uint32_t * src, |
|
1227 const uint32_t * mask, |
|
1228 int width) |
|
1229 { |
|
1230 const uint32_t *end = src + width; |
|
1231 |
|
1232 while (src < end) |
|
1233 { |
|
1234 __m64 a = load8888 (mask); |
|
1235 __m64 s = load8888 (src); |
|
1236 __m64 d = load8888 (dest); |
|
1237 __m64 sa = expand_alpha (s); |
|
1238 |
|
1239 a = pix_multiply (a, sa); |
|
1240 a = negate (a); |
|
1241 d = pix_multiply (d, a); |
|
1242 store8888 (dest, d); |
|
1243 |
|
1244 ++src; |
|
1245 ++dest; |
|
1246 ++mask; |
|
1247 } |
|
1248 _mm_empty (); |
|
1249 } |
|
1250 |
|
1251 static void |
|
1252 mmx_combine_atop_ca (pixman_implementation_t *imp, |
|
1253 pixman_op_t op, |
|
1254 uint32_t * dest, |
|
1255 const uint32_t * src, |
|
1256 const uint32_t * mask, |
|
1257 int width) |
|
1258 { |
|
1259 const uint32_t *end = src + width; |
|
1260 |
|
1261 while (src < end) |
|
1262 { |
|
1263 __m64 a = load8888 (mask); |
|
1264 __m64 s = load8888 (src); |
|
1265 __m64 d = load8888 (dest); |
|
1266 __m64 da = expand_alpha (d); |
|
1267 __m64 sa = expand_alpha (s); |
|
1268 |
|
1269 s = pix_multiply (s, a); |
|
1270 a = pix_multiply (a, sa); |
|
1271 a = negate (a); |
|
1272 d = pix_add_mul (d, a, s, da); |
|
1273 store8888 (dest, d); |
|
1274 |
|
1275 ++src; |
|
1276 ++dest; |
|
1277 ++mask; |
|
1278 } |
|
1279 _mm_empty (); |
|
1280 } |
|
1281 |
|
1282 static void |
|
1283 mmx_combine_atop_reverse_ca (pixman_implementation_t *imp, |
|
1284 pixman_op_t op, |
|
1285 uint32_t * dest, |
|
1286 const uint32_t * src, |
|
1287 const uint32_t * mask, |
|
1288 int width) |
|
1289 { |
|
1290 const uint32_t *end = src + width; |
|
1291 |
|
1292 while (src < end) |
|
1293 { |
|
1294 __m64 a = load8888 (mask); |
|
1295 __m64 s = load8888 (src); |
|
1296 __m64 d = load8888 (dest); |
|
1297 __m64 da = expand_alpha (d); |
|
1298 __m64 sa = expand_alpha (s); |
|
1299 |
|
1300 s = pix_multiply (s, a); |
|
1301 a = pix_multiply (a, sa); |
|
1302 da = negate (da); |
|
1303 d = pix_add_mul (d, a, s, da); |
|
1304 store8888 (dest, d); |
|
1305 |
|
1306 ++src; |
|
1307 ++dest; |
|
1308 ++mask; |
|
1309 } |
|
1310 _mm_empty (); |
|
1311 } |
|
1312 |
|
1313 static void |
|
1314 mmx_combine_xor_ca (pixman_implementation_t *imp, |
|
1315 pixman_op_t op, |
|
1316 uint32_t * dest, |
|
1317 const uint32_t * src, |
|
1318 const uint32_t * mask, |
|
1319 int width) |
|
1320 { |
|
1321 const uint32_t *end = src + width; |
|
1322 |
|
1323 while (src < end) |
|
1324 { |
|
1325 __m64 a = load8888 (mask); |
|
1326 __m64 s = load8888 (src); |
|
1327 __m64 d = load8888 (dest); |
|
1328 __m64 da = expand_alpha (d); |
|
1329 __m64 sa = expand_alpha (s); |
|
1330 |
|
1331 s = pix_multiply (s, a); |
|
1332 a = pix_multiply (a, sa); |
|
1333 da = negate (da); |
|
1334 a = negate (a); |
|
1335 d = pix_add_mul (d, a, s, da); |
|
1336 store8888 (dest, d); |
|
1337 |
|
1338 ++src; |
|
1339 ++dest; |
|
1340 ++mask; |
|
1341 } |
|
1342 _mm_empty (); |
|
1343 } |
|
1344 |
|
1345 static void |
|
1346 mmx_combine_add_ca (pixman_implementation_t *imp, |
|
1347 pixman_op_t op, |
|
1348 uint32_t * dest, |
|
1349 const uint32_t * src, |
|
1350 const uint32_t * mask, |
|
1351 int width) |
|
1352 { |
|
1353 const uint32_t *end = src + width; |
|
1354 |
|
1355 while (src < end) |
|
1356 { |
|
1357 __m64 a = load8888 (mask); |
|
1358 __m64 s = load8888 (src); |
|
1359 __m64 d = load8888 (dest); |
|
1360 |
|
1361 s = pix_multiply (s, a); |
|
1362 d = pix_add (s, d); |
|
1363 store8888 (dest, d); |
|
1364 |
|
1365 ++src; |
|
1366 ++dest; |
|
1367 ++mask; |
|
1368 } |
|
1369 _mm_empty (); |
|
1370 } |
|
1371 |
|
1372 /* ------------- MMX code paths called from fbpict.c -------------------- */ |
|
1373 |
|
1374 static void |
|
1375 mmx_composite_over_n_8888 (pixman_implementation_t *imp, |
|
1376 pixman_composite_info_t *info) |
|
1377 { |
|
1378 PIXMAN_COMPOSITE_ARGS (info); |
|
1379 uint32_t src; |
|
1380 uint32_t *dst_line, *dst; |
|
1381 int32_t w; |
|
1382 int dst_stride; |
|
1383 __m64 vsrc, vsrca; |
|
1384 |
|
1385 CHECKPOINT (); |
|
1386 |
|
1387 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
1388 |
|
1389 if (src == 0) |
|
1390 return; |
|
1391 |
|
1392 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
1393 |
|
1394 vsrc = load8888 (&src); |
|
1395 vsrca = expand_alpha (vsrc); |
|
1396 |
|
1397 while (height--) |
|
1398 { |
|
1399 dst = dst_line; |
|
1400 dst_line += dst_stride; |
|
1401 w = width; |
|
1402 |
|
1403 CHECKPOINT (); |
|
1404 |
|
1405 while (w && (uintptr_t)dst & 7) |
|
1406 { |
|
1407 store8888 (dst, over (vsrc, vsrca, load8888 (dst))); |
|
1408 |
|
1409 w--; |
|
1410 dst++; |
|
1411 } |
|
1412 |
|
1413 while (w >= 2) |
|
1414 { |
|
1415 __m64 vdest; |
|
1416 __m64 dest0, dest1; |
|
1417 |
|
1418 vdest = *(__m64 *)dst; |
|
1419 |
|
1420 dest0 = over (vsrc, vsrca, expand8888 (vdest, 0)); |
|
1421 dest1 = over (vsrc, vsrca, expand8888 (vdest, 1)); |
|
1422 |
|
1423 *(__m64 *)dst = pack8888 (dest0, dest1); |
|
1424 |
|
1425 dst += 2; |
|
1426 w -= 2; |
|
1427 } |
|
1428 |
|
1429 CHECKPOINT (); |
|
1430 |
|
1431 if (w) |
|
1432 { |
|
1433 store8888 (dst, over (vsrc, vsrca, load8888 (dst))); |
|
1434 } |
|
1435 } |
|
1436 |
|
1437 _mm_empty (); |
|
1438 } |
|
1439 |
|
1440 static void |
|
1441 mmx_composite_over_n_0565 (pixman_implementation_t *imp, |
|
1442 pixman_composite_info_t *info) |
|
1443 { |
|
1444 PIXMAN_COMPOSITE_ARGS (info); |
|
1445 uint32_t src; |
|
1446 uint16_t *dst_line, *dst; |
|
1447 int32_t w; |
|
1448 int dst_stride; |
|
1449 __m64 vsrc, vsrca; |
|
1450 |
|
1451 CHECKPOINT (); |
|
1452 |
|
1453 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
1454 |
|
1455 if (src == 0) |
|
1456 return; |
|
1457 |
|
1458 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
|
1459 |
|
1460 vsrc = load8888 (&src); |
|
1461 vsrca = expand_alpha (vsrc); |
|
1462 |
|
1463 while (height--) |
|
1464 { |
|
1465 dst = dst_line; |
|
1466 dst_line += dst_stride; |
|
1467 w = width; |
|
1468 |
|
1469 CHECKPOINT (); |
|
1470 |
|
1471 while (w && (uintptr_t)dst & 7) |
|
1472 { |
|
1473 uint64_t d = *dst; |
|
1474 __m64 vdest = expand565 (to_m64 (d), 0); |
|
1475 |
|
1476 vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0); |
|
1477 *dst = to_uint64 (vdest); |
|
1478 |
|
1479 w--; |
|
1480 dst++; |
|
1481 } |
|
1482 |
|
1483 while (w >= 4) |
|
1484 { |
|
1485 __m64 vdest = *(__m64 *)dst; |
|
1486 __m64 v0, v1, v2, v3; |
|
1487 |
|
1488 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); |
|
1489 |
|
1490 v0 = over (vsrc, vsrca, v0); |
|
1491 v1 = over (vsrc, vsrca, v1); |
|
1492 v2 = over (vsrc, vsrca, v2); |
|
1493 v3 = over (vsrc, vsrca, v3); |
|
1494 |
|
1495 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); |
|
1496 |
|
1497 dst += 4; |
|
1498 w -= 4; |
|
1499 } |
|
1500 |
|
1501 CHECKPOINT (); |
|
1502 |
|
1503 while (w) |
|
1504 { |
|
1505 uint64_t d = *dst; |
|
1506 __m64 vdest = expand565 (to_m64 (d), 0); |
|
1507 |
|
1508 vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0); |
|
1509 *dst = to_uint64 (vdest); |
|
1510 |
|
1511 w--; |
|
1512 dst++; |
|
1513 } |
|
1514 } |
|
1515 |
|
1516 _mm_empty (); |
|
1517 } |
|
1518 |
|
1519 static void |
|
1520 mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, |
|
1521 pixman_composite_info_t *info) |
|
1522 { |
|
1523 PIXMAN_COMPOSITE_ARGS (info); |
|
1524 uint32_t src; |
|
1525 uint32_t *dst_line; |
|
1526 uint32_t *mask_line; |
|
1527 int dst_stride, mask_stride; |
|
1528 __m64 vsrc, vsrca; |
|
1529 |
|
1530 CHECKPOINT (); |
|
1531 |
|
1532 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
1533 |
|
1534 if (src == 0) |
|
1535 return; |
|
1536 |
|
1537 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
1538 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
|
1539 |
|
1540 vsrc = load8888 (&src); |
|
1541 vsrca = expand_alpha (vsrc); |
|
1542 |
|
1543 while (height--) |
|
1544 { |
|
1545 int twidth = width; |
|
1546 uint32_t *p = (uint32_t *)mask_line; |
|
1547 uint32_t *q = (uint32_t *)dst_line; |
|
1548 |
|
1549 while (twidth && (uintptr_t)q & 7) |
|
1550 { |
|
1551 uint32_t m = *(uint32_t *)p; |
|
1552 |
|
1553 if (m) |
|
1554 { |
|
1555 __m64 vdest = load8888 (q); |
|
1556 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); |
|
1557 store8888 (q, vdest); |
|
1558 } |
|
1559 |
|
1560 twidth--; |
|
1561 p++; |
|
1562 q++; |
|
1563 } |
|
1564 |
|
1565 while (twidth >= 2) |
|
1566 { |
|
1567 uint32_t m0, m1; |
|
1568 m0 = *p; |
|
1569 m1 = *(p + 1); |
|
1570 |
|
1571 if (m0 | m1) |
|
1572 { |
|
1573 __m64 dest0, dest1; |
|
1574 __m64 vdest = *(__m64 *)q; |
|
1575 |
|
1576 dest0 = in_over (vsrc, vsrca, load8888 (&m0), |
|
1577 expand8888 (vdest, 0)); |
|
1578 dest1 = in_over (vsrc, vsrca, load8888 (&m1), |
|
1579 expand8888 (vdest, 1)); |
|
1580 |
|
1581 *(__m64 *)q = pack8888 (dest0, dest1); |
|
1582 } |
|
1583 |
|
1584 p += 2; |
|
1585 q += 2; |
|
1586 twidth -= 2; |
|
1587 } |
|
1588 |
|
1589 if (twidth) |
|
1590 { |
|
1591 uint32_t m = *(uint32_t *)p; |
|
1592 |
|
1593 if (m) |
|
1594 { |
|
1595 __m64 vdest = load8888 (q); |
|
1596 vdest = in_over (vsrc, vsrca, load8888 (&m), vdest); |
|
1597 store8888 (q, vdest); |
|
1598 } |
|
1599 |
|
1600 twidth--; |
|
1601 p++; |
|
1602 q++; |
|
1603 } |
|
1604 |
|
1605 dst_line += dst_stride; |
|
1606 mask_line += mask_stride; |
|
1607 } |
|
1608 |
|
1609 _mm_empty (); |
|
1610 } |
|
1611 |
|
1612 static void |
|
1613 mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, |
|
1614 pixman_composite_info_t *info) |
|
1615 { |
|
1616 PIXMAN_COMPOSITE_ARGS (info); |
|
1617 uint32_t *dst_line, *dst; |
|
1618 uint32_t *src_line, *src; |
|
1619 uint32_t mask; |
|
1620 __m64 vmask; |
|
1621 int dst_stride, src_stride; |
|
1622 int32_t w; |
|
1623 |
|
1624 CHECKPOINT (); |
|
1625 |
|
1626 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
1627 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
1628 |
|
1629 mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); |
|
1630 vmask = expand_alpha (load8888 (&mask)); |
|
1631 |
|
1632 while (height--) |
|
1633 { |
|
1634 dst = dst_line; |
|
1635 dst_line += dst_stride; |
|
1636 src = src_line; |
|
1637 src_line += src_stride; |
|
1638 w = width; |
|
1639 |
|
1640 while (w && (uintptr_t)dst & 7) |
|
1641 { |
|
1642 __m64 s = load8888 (src); |
|
1643 __m64 d = load8888 (dst); |
|
1644 |
|
1645 store8888 (dst, in_over (s, expand_alpha (s), vmask, d)); |
|
1646 |
|
1647 w--; |
|
1648 dst++; |
|
1649 src++; |
|
1650 } |
|
1651 |
|
1652 while (w >= 2) |
|
1653 { |
|
1654 __m64 vs = ldq_u ((__m64 *)src); |
|
1655 __m64 vd = *(__m64 *)dst; |
|
1656 __m64 vsrc0 = expand8888 (vs, 0); |
|
1657 __m64 vsrc1 = expand8888 (vs, 1); |
|
1658 |
|
1659 *(__m64 *)dst = pack8888 ( |
|
1660 in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)), |
|
1661 in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1))); |
|
1662 |
|
1663 w -= 2; |
|
1664 dst += 2; |
|
1665 src += 2; |
|
1666 } |
|
1667 |
|
1668 if (w) |
|
1669 { |
|
1670 __m64 s = load8888 (src); |
|
1671 __m64 d = load8888 (dst); |
|
1672 |
|
1673 store8888 (dst, in_over (s, expand_alpha (s), vmask, d)); |
|
1674 } |
|
1675 } |
|
1676 |
|
1677 _mm_empty (); |
|
1678 } |
|
1679 |
|
1680 static void |
|
1681 mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp, |
|
1682 pixman_composite_info_t *info) |
|
1683 { |
|
1684 PIXMAN_COMPOSITE_ARGS (info); |
|
1685 uint32_t *dst_line, *dst; |
|
1686 uint32_t *src_line, *src; |
|
1687 uint32_t mask; |
|
1688 __m64 vmask; |
|
1689 int dst_stride, src_stride; |
|
1690 int32_t w; |
|
1691 __m64 srca; |
|
1692 |
|
1693 CHECKPOINT (); |
|
1694 |
|
1695 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
1696 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
1697 mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format); |
|
1698 |
|
1699 vmask = expand_alpha (load8888 (&mask)); |
|
1700 srca = MC (4x00ff); |
|
1701 |
|
1702 while (height--) |
|
1703 { |
|
1704 dst = dst_line; |
|
1705 dst_line += dst_stride; |
|
1706 src = src_line; |
|
1707 src_line += src_stride; |
|
1708 w = width; |
|
1709 |
|
1710 while (w && (uintptr_t)dst & 7) |
|
1711 { |
|
1712 uint32_t ssrc = *src | 0xff000000; |
|
1713 __m64 s = load8888 (&ssrc); |
|
1714 __m64 d = load8888 (dst); |
|
1715 |
|
1716 store8888 (dst, in_over (s, srca, vmask, d)); |
|
1717 |
|
1718 w--; |
|
1719 dst++; |
|
1720 src++; |
|
1721 } |
|
1722 |
|
1723 while (w >= 16) |
|
1724 { |
|
1725 __m64 vd0 = *(__m64 *)(dst + 0); |
|
1726 __m64 vd1 = *(__m64 *)(dst + 2); |
|
1727 __m64 vd2 = *(__m64 *)(dst + 4); |
|
1728 __m64 vd3 = *(__m64 *)(dst + 6); |
|
1729 __m64 vd4 = *(__m64 *)(dst + 8); |
|
1730 __m64 vd5 = *(__m64 *)(dst + 10); |
|
1731 __m64 vd6 = *(__m64 *)(dst + 12); |
|
1732 __m64 vd7 = *(__m64 *)(dst + 14); |
|
1733 |
|
1734 __m64 vs0 = ldq_u ((__m64 *)(src + 0)); |
|
1735 __m64 vs1 = ldq_u ((__m64 *)(src + 2)); |
|
1736 __m64 vs2 = ldq_u ((__m64 *)(src + 4)); |
|
1737 __m64 vs3 = ldq_u ((__m64 *)(src + 6)); |
|
1738 __m64 vs4 = ldq_u ((__m64 *)(src + 8)); |
|
1739 __m64 vs5 = ldq_u ((__m64 *)(src + 10)); |
|
1740 __m64 vs6 = ldq_u ((__m64 *)(src + 12)); |
|
1741 __m64 vs7 = ldq_u ((__m64 *)(src + 14)); |
|
1742 |
|
1743 vd0 = pack8888 ( |
|
1744 in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)), |
|
1745 in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1))); |
|
1746 |
|
1747 vd1 = pack8888 ( |
|
1748 in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)), |
|
1749 in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1))); |
|
1750 |
|
1751 vd2 = pack8888 ( |
|
1752 in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)), |
|
1753 in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1))); |
|
1754 |
|
1755 vd3 = pack8888 ( |
|
1756 in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)), |
|
1757 in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1))); |
|
1758 |
|
1759 vd4 = pack8888 ( |
|
1760 in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)), |
|
1761 in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1))); |
|
1762 |
|
1763 vd5 = pack8888 ( |
|
1764 in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)), |
|
1765 in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1))); |
|
1766 |
|
1767 vd6 = pack8888 ( |
|
1768 in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)), |
|
1769 in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1))); |
|
1770 |
|
1771 vd7 = pack8888 ( |
|
1772 in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)), |
|
1773 in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1))); |
|
1774 |
|
1775 *(__m64 *)(dst + 0) = vd0; |
|
1776 *(__m64 *)(dst + 2) = vd1; |
|
1777 *(__m64 *)(dst + 4) = vd2; |
|
1778 *(__m64 *)(dst + 6) = vd3; |
|
1779 *(__m64 *)(dst + 8) = vd4; |
|
1780 *(__m64 *)(dst + 10) = vd5; |
|
1781 *(__m64 *)(dst + 12) = vd6; |
|
1782 *(__m64 *)(dst + 14) = vd7; |
|
1783 |
|
1784 w -= 16; |
|
1785 dst += 16; |
|
1786 src += 16; |
|
1787 } |
|
1788 |
|
1789 while (w) |
|
1790 { |
|
1791 uint32_t ssrc = *src | 0xff000000; |
|
1792 __m64 s = load8888 (&ssrc); |
|
1793 __m64 d = load8888 (dst); |
|
1794 |
|
1795 store8888 (dst, in_over (s, srca, vmask, d)); |
|
1796 |
|
1797 w--; |
|
1798 dst++; |
|
1799 src++; |
|
1800 } |
|
1801 } |
|
1802 |
|
1803 _mm_empty (); |
|
1804 } |
|
1805 |
|
1806 static void |
|
1807 mmx_composite_over_8888_8888 (pixman_implementation_t *imp, |
|
1808 pixman_composite_info_t *info) |
|
1809 { |
|
1810 PIXMAN_COMPOSITE_ARGS (info); |
|
1811 uint32_t *dst_line, *dst; |
|
1812 uint32_t *src_line, *src; |
|
1813 uint32_t s; |
|
1814 int dst_stride, src_stride; |
|
1815 uint8_t a; |
|
1816 int32_t w; |
|
1817 |
|
1818 CHECKPOINT (); |
|
1819 |
|
1820 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
1821 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
1822 |
|
1823 while (height--) |
|
1824 { |
|
1825 dst = dst_line; |
|
1826 dst_line += dst_stride; |
|
1827 src = src_line; |
|
1828 src_line += src_stride; |
|
1829 w = width; |
|
1830 |
|
1831 while (w--) |
|
1832 { |
|
1833 s = *src++; |
|
1834 a = s >> 24; |
|
1835 |
|
1836 if (a == 0xff) |
|
1837 { |
|
1838 *dst = s; |
|
1839 } |
|
1840 else if (s) |
|
1841 { |
|
1842 __m64 ms, sa; |
|
1843 ms = load8888 (&s); |
|
1844 sa = expand_alpha (ms); |
|
1845 store8888 (dst, over (ms, sa, load8888 (dst))); |
|
1846 } |
|
1847 |
|
1848 dst++; |
|
1849 } |
|
1850 } |
|
1851 _mm_empty (); |
|
1852 } |
|
1853 |
|
1854 static void |
|
1855 mmx_composite_over_8888_0565 (pixman_implementation_t *imp, |
|
1856 pixman_composite_info_t *info) |
|
1857 { |
|
1858 PIXMAN_COMPOSITE_ARGS (info); |
|
1859 uint16_t *dst_line, *dst; |
|
1860 uint32_t *src_line, *src; |
|
1861 int dst_stride, src_stride; |
|
1862 int32_t w; |
|
1863 |
|
1864 CHECKPOINT (); |
|
1865 |
|
1866 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
|
1867 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
1868 |
|
1869 #if 0 |
|
1870 /* FIXME */ |
|
1871 assert (src_image->drawable == mask_image->drawable); |
|
1872 #endif |
|
1873 |
|
1874 while (height--) |
|
1875 { |
|
1876 dst = dst_line; |
|
1877 dst_line += dst_stride; |
|
1878 src = src_line; |
|
1879 src_line += src_stride; |
|
1880 w = width; |
|
1881 |
|
1882 CHECKPOINT (); |
|
1883 |
|
1884 while (w && (uintptr_t)dst & 7) |
|
1885 { |
|
1886 __m64 vsrc = load8888 (src); |
|
1887 uint64_t d = *dst; |
|
1888 __m64 vdest = expand565 (to_m64 (d), 0); |
|
1889 |
|
1890 vdest = pack_565 ( |
|
1891 over (vsrc, expand_alpha (vsrc), vdest), vdest, 0); |
|
1892 |
|
1893 *dst = to_uint64 (vdest); |
|
1894 |
|
1895 w--; |
|
1896 dst++; |
|
1897 src++; |
|
1898 } |
|
1899 |
|
1900 CHECKPOINT (); |
|
1901 |
|
1902 while (w >= 4) |
|
1903 { |
|
1904 __m64 vdest = *(__m64 *)dst; |
|
1905 __m64 v0, v1, v2, v3; |
|
1906 __m64 vsrc0, vsrc1, vsrc2, vsrc3; |
|
1907 |
|
1908 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); |
|
1909 |
|
1910 vsrc0 = load8888 ((src + 0)); |
|
1911 vsrc1 = load8888 ((src + 1)); |
|
1912 vsrc2 = load8888 ((src + 2)); |
|
1913 vsrc3 = load8888 ((src + 3)); |
|
1914 |
|
1915 v0 = over (vsrc0, expand_alpha (vsrc0), v0); |
|
1916 v1 = over (vsrc1, expand_alpha (vsrc1), v1); |
|
1917 v2 = over (vsrc2, expand_alpha (vsrc2), v2); |
|
1918 v3 = over (vsrc3, expand_alpha (vsrc3), v3); |
|
1919 |
|
1920 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); |
|
1921 |
|
1922 w -= 4; |
|
1923 dst += 4; |
|
1924 src += 4; |
|
1925 } |
|
1926 |
|
1927 CHECKPOINT (); |
|
1928 |
|
1929 while (w) |
|
1930 { |
|
1931 __m64 vsrc = load8888 (src); |
|
1932 uint64_t d = *dst; |
|
1933 __m64 vdest = expand565 (to_m64 (d), 0); |
|
1934 |
|
1935 vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0); |
|
1936 |
|
1937 *dst = to_uint64 (vdest); |
|
1938 |
|
1939 w--; |
|
1940 dst++; |
|
1941 src++; |
|
1942 } |
|
1943 } |
|
1944 |
|
1945 _mm_empty (); |
|
1946 } |
|
1947 |
|
1948 static void |
|
1949 mmx_composite_over_n_8_8888 (pixman_implementation_t *imp, |
|
1950 pixman_composite_info_t *info) |
|
1951 { |
|
1952 PIXMAN_COMPOSITE_ARGS (info); |
|
1953 uint32_t src, srca; |
|
1954 uint32_t *dst_line, *dst; |
|
1955 uint8_t *mask_line, *mask; |
|
1956 int dst_stride, mask_stride; |
|
1957 int32_t w; |
|
1958 __m64 vsrc, vsrca; |
|
1959 uint64_t srcsrc; |
|
1960 |
|
1961 CHECKPOINT (); |
|
1962 |
|
1963 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
1964 |
|
1965 srca = src >> 24; |
|
1966 if (src == 0) |
|
1967 return; |
|
1968 |
|
1969 srcsrc = (uint64_t)src << 32 | src; |
|
1970 |
|
1971 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
1972 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
|
1973 |
|
1974 vsrc = load8888 (&src); |
|
1975 vsrca = expand_alpha (vsrc); |
|
1976 |
|
1977 while (height--) |
|
1978 { |
|
1979 dst = dst_line; |
|
1980 dst_line += dst_stride; |
|
1981 mask = mask_line; |
|
1982 mask_line += mask_stride; |
|
1983 w = width; |
|
1984 |
|
1985 CHECKPOINT (); |
|
1986 |
|
1987 while (w && (uintptr_t)dst & 7) |
|
1988 { |
|
1989 uint64_t m = *mask; |
|
1990 |
|
1991 if (m) |
|
1992 { |
|
1993 __m64 vdest = in_over (vsrc, vsrca, |
|
1994 expand_alpha_rev (to_m64 (m)), |
|
1995 load8888 (dst)); |
|
1996 |
|
1997 store8888 (dst, vdest); |
|
1998 } |
|
1999 |
|
2000 w--; |
|
2001 mask++; |
|
2002 dst++; |
|
2003 } |
|
2004 |
|
2005 CHECKPOINT (); |
|
2006 |
|
2007 while (w >= 2) |
|
2008 { |
|
2009 uint64_t m0, m1; |
|
2010 |
|
2011 m0 = *mask; |
|
2012 m1 = *(mask + 1); |
|
2013 |
|
2014 if (srca == 0xff && (m0 & m1) == 0xff) |
|
2015 { |
|
2016 *(uint64_t *)dst = srcsrc; |
|
2017 } |
|
2018 else if (m0 | m1) |
|
2019 { |
|
2020 __m64 vdest; |
|
2021 __m64 dest0, dest1; |
|
2022 |
|
2023 vdest = *(__m64 *)dst; |
|
2024 |
|
2025 dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)), |
|
2026 expand8888 (vdest, 0)); |
|
2027 dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)), |
|
2028 expand8888 (vdest, 1)); |
|
2029 |
|
2030 *(__m64 *)dst = pack8888 (dest0, dest1); |
|
2031 } |
|
2032 |
|
2033 mask += 2; |
|
2034 dst += 2; |
|
2035 w -= 2; |
|
2036 } |
|
2037 |
|
2038 CHECKPOINT (); |
|
2039 |
|
2040 if (w) |
|
2041 { |
|
2042 uint64_t m = *mask; |
|
2043 |
|
2044 if (m) |
|
2045 { |
|
2046 __m64 vdest = load8888 (dst); |
|
2047 |
|
2048 vdest = in_over ( |
|
2049 vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest); |
|
2050 store8888 (dst, vdest); |
|
2051 } |
|
2052 } |
|
2053 } |
|
2054 |
|
2055 _mm_empty (); |
|
2056 } |
|
2057 |
|
2058 static pixman_bool_t |
|
2059 mmx_fill (pixman_implementation_t *imp, |
|
2060 uint32_t * bits, |
|
2061 int stride, |
|
2062 int bpp, |
|
2063 int x, |
|
2064 int y, |
|
2065 int width, |
|
2066 int height, |
|
2067 uint32_t filler) |
|
2068 { |
|
2069 uint64_t fill; |
|
2070 __m64 vfill; |
|
2071 uint32_t byte_width; |
|
2072 uint8_t *byte_line; |
|
2073 |
|
2074 #if defined __GNUC__ && defined USE_X86_MMX |
|
2075 __m64 v1, v2, v3, v4, v5, v6, v7; |
|
2076 #endif |
|
2077 |
|
2078 if (bpp != 16 && bpp != 32 && bpp != 8) |
|
2079 return FALSE; |
|
2080 |
|
2081 if (bpp == 8) |
|
2082 { |
|
2083 stride = stride * (int) sizeof (uint32_t) / 1; |
|
2084 byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x); |
|
2085 byte_width = width; |
|
2086 stride *= 1; |
|
2087 filler = (filler & 0xff) * 0x01010101; |
|
2088 } |
|
2089 else if (bpp == 16) |
|
2090 { |
|
2091 stride = stride * (int) sizeof (uint32_t) / 2; |
|
2092 byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x); |
|
2093 byte_width = 2 * width; |
|
2094 stride *= 2; |
|
2095 filler = (filler & 0xffff) * 0x00010001; |
|
2096 } |
|
2097 else |
|
2098 { |
|
2099 stride = stride * (int) sizeof (uint32_t) / 4; |
|
2100 byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x); |
|
2101 byte_width = 4 * width; |
|
2102 stride *= 4; |
|
2103 } |
|
2104 |
|
2105 fill = ((uint64_t)filler << 32) | filler; |
|
2106 vfill = to_m64 (fill); |
|
2107 |
|
2108 #if defined __GNUC__ && defined USE_X86_MMX |
|
2109 __asm__ ( |
|
2110 "movq %7, %0\n" |
|
2111 "movq %7, %1\n" |
|
2112 "movq %7, %2\n" |
|
2113 "movq %7, %3\n" |
|
2114 "movq %7, %4\n" |
|
2115 "movq %7, %5\n" |
|
2116 "movq %7, %6\n" |
|
2117 : "=&y" (v1), "=&y" (v2), "=&y" (v3), |
|
2118 "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7) |
|
2119 : "y" (vfill)); |
|
2120 #endif |
|
2121 |
|
2122 while (height--) |
|
2123 { |
|
2124 int w; |
|
2125 uint8_t *d = byte_line; |
|
2126 |
|
2127 byte_line += stride; |
|
2128 w = byte_width; |
|
2129 |
|
2130 if (w >= 1 && ((uintptr_t)d & 1)) |
|
2131 { |
|
2132 *(uint8_t *)d = (filler & 0xff); |
|
2133 w--; |
|
2134 d++; |
|
2135 } |
|
2136 |
|
2137 if (w >= 2 && ((uintptr_t)d & 3)) |
|
2138 { |
|
2139 *(uint16_t *)d = filler; |
|
2140 w -= 2; |
|
2141 d += 2; |
|
2142 } |
|
2143 |
|
2144 while (w >= 4 && ((uintptr_t)d & 7)) |
|
2145 { |
|
2146 *(uint32_t *)d = filler; |
|
2147 |
|
2148 w -= 4; |
|
2149 d += 4; |
|
2150 } |
|
2151 |
|
2152 while (w >= 64) |
|
2153 { |
|
2154 #if defined __GNUC__ && defined USE_X86_MMX |
|
2155 __asm__ ( |
|
2156 "movq %1, (%0)\n" |
|
2157 "movq %2, 8(%0)\n" |
|
2158 "movq %3, 16(%0)\n" |
|
2159 "movq %4, 24(%0)\n" |
|
2160 "movq %5, 32(%0)\n" |
|
2161 "movq %6, 40(%0)\n" |
|
2162 "movq %7, 48(%0)\n" |
|
2163 "movq %8, 56(%0)\n" |
|
2164 : |
|
2165 : "r" (d), |
|
2166 "y" (vfill), "y" (v1), "y" (v2), "y" (v3), |
|
2167 "y" (v4), "y" (v5), "y" (v6), "y" (v7) |
|
2168 : "memory"); |
|
2169 #else |
|
2170 *(__m64*) (d + 0) = vfill; |
|
2171 *(__m64*) (d + 8) = vfill; |
|
2172 *(__m64*) (d + 16) = vfill; |
|
2173 *(__m64*) (d + 24) = vfill; |
|
2174 *(__m64*) (d + 32) = vfill; |
|
2175 *(__m64*) (d + 40) = vfill; |
|
2176 *(__m64*) (d + 48) = vfill; |
|
2177 *(__m64*) (d + 56) = vfill; |
|
2178 #endif |
|
2179 w -= 64; |
|
2180 d += 64; |
|
2181 } |
|
2182 |
|
2183 while (w >= 4) |
|
2184 { |
|
2185 *(uint32_t *)d = filler; |
|
2186 |
|
2187 w -= 4; |
|
2188 d += 4; |
|
2189 } |
|
2190 if (w >= 2) |
|
2191 { |
|
2192 *(uint16_t *)d = filler; |
|
2193 w -= 2; |
|
2194 d += 2; |
|
2195 } |
|
2196 if (w >= 1) |
|
2197 { |
|
2198 *(uint8_t *)d = (filler & 0xff); |
|
2199 w--; |
|
2200 d++; |
|
2201 } |
|
2202 |
|
2203 } |
|
2204 |
|
2205 _mm_empty (); |
|
2206 return TRUE; |
|
2207 } |
|
2208 |
|
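/* SRC: convert x8r8g8b8/a8r8g8b8 (or the BGR equivalents) down to
 * r5g6b5, packing four pixels per 64-bit store in the inner loop.
 */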
2209 static void |
|
2210 mmx_composite_src_x888_0565 (pixman_implementation_t *imp, |
|
2211 pixman_composite_info_t *info) |
|
2212 { |
|
2213 PIXMAN_COMPOSITE_ARGS (info); |
|
2214 uint16_t *dst_line, *dst; |
|
2215 uint32_t *src_line, *src, s; |
|
2216 int dst_stride, src_stride; |
|
2217 int32_t w; |
|
2218 |
|
2219 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
2220 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
|
2221 |
|
2222 while (height--) |
|
2223 { |
|
2224 dst = dst_line; |
|
2225 dst_line += dst_stride; |
|
2226 src = src_line; |
|
2227 src_line += src_stride; |
|
2228 w = width; |
|
2229 |
|
2230 while (w && (uintptr_t)dst & 7) |
|
2231 { |
|
2232 s = *src++; |
|
2233 *dst = convert_8888_to_0565 (s); |
|
2234 dst++; |
|
2235 w--; |
|
2236 } |
|
2237 |
|
2238 while (w >= 4) |
|
2239 { |
|
2240 __m64 vdest; |
|
2241 __m64 vsrc0 = ldq_u ((__m64 *)(src + 0)); |
|
2242 __m64 vsrc1 = ldq_u ((__m64 *)(src + 2)); |
|
2243 |
|
2244 vdest = pack_4xpacked565 (vsrc0, vsrc1); |
|
2245 |
|
2246 *(__m64 *)dst = vdest; |
|
2247 |
|
2248 w -= 4; |
|
2249 src += 4; |
|
2250 dst += 4; |
|
2251 } |
|
2252 |
|
2253 while (w) |
|
2254 { |
|
2255 s = *src++; |
|
2256 *dst = convert_8888_to_0565 (s); |
|
2257 dst++; |
|
2258 w--; |
|
2259 } |
|
2260 } |
|
2261 |
|
2262 _mm_empty (); |
|
2263 } |
|
2264 |
|
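/* SRC: solid color through an a8 mask into an 8888 destination. A zero
 * source degenerates to mmx_fill (); zero mask bytes clear the
 * destination, and an opaque color under an opaque mask stores the
 * replicated source directly.
 */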
2265 static void |
|
2266 mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, |
|
2267 pixman_composite_info_t *info) |
|
2268 { |
|
2269 PIXMAN_COMPOSITE_ARGS (info); |
|
2270 uint32_t src, srca; |
|
2271 uint32_t *dst_line, *dst; |
|
2272 uint8_t *mask_line, *mask; |
|
2273 int dst_stride, mask_stride; |
|
2274 int32_t w; |
|
2275 __m64 vsrc; |
|
2276 uint64_t srcsrc; |
|
2277 |
|
2278 CHECKPOINT (); |
|
2279 |
|
2280 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
2281 |
|
2282 srca = src >> 24; |
|
2283 if (src == 0) |
|
2284 { |
|
2285 mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride, |
|
2286 PIXMAN_FORMAT_BPP (dest_image->bits.format), |
|
2287 dest_x, dest_y, width, height, 0); |
|
2288 return; |
|
2289 } |
|
2290 |
|
2291 srcsrc = (uint64_t)src << 32 | src; |
|
2292 |
|
2293 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
2294 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
|
2295 |
|
2296 vsrc = load8888 (&src); |
|
2297 |
|
2298 while (height--) |
|
2299 { |
|
2300 dst = dst_line; |
|
2301 dst_line += dst_stride; |
|
2302 mask = mask_line; |
|
2303 mask_line += mask_stride; |
|
2304 w = width; |
|
2305 |
|
2306 CHECKPOINT (); |
|
2307 |
|
2308 while (w && (uintptr_t)dst & 7) |
|
2309 { |
|
2310 uint64_t m = *mask; |
|
2311 |
|
2312 if (m) |
|
2313 { |
|
2314 __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); |
|
2315 |
|
2316 store8888 (dst, vdest); |
|
2317 } |
|
2318 else |
|
2319 { |
|
2320 *dst = 0; |
|
2321 } |
|
2322 |
|
2323 w--; |
|
2324 mask++; |
|
2325 dst++; |
|
2326 } |
|
2327 |
|
2328 CHECKPOINT (); |
|
2329 |
|
2330 while (w >= 2) |
|
2331 { |
|
2332 uint64_t m0, m1; |
|
2333 m0 = *mask; |
|
2334 m1 = *(mask + 1); |
|
2335 |
|
2336 if (srca == 0xff && (m0 & m1) == 0xff) |
|
2337 { |
|
2338 *(uint64_t *)dst = srcsrc; |
|
2339 } |
|
2340 else if (m0 | m1) |
|
2341 { |
|
2342 __m64 dest0, dest1; |
|
2343 |
|
2344 dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0))); |
|
2345 dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1))); |
|
2346 |
|
2347 *(__m64 *)dst = pack8888 (dest0, dest1); |
|
2348 } |
|
2349 else |
|
2350 { |
|
2351 *(uint64_t *)dst = 0; |
|
2352 } |
|
2353 |
|
2354 mask += 2; |
|
2355 dst += 2; |
|
2356 w -= 2; |
|
2357 } |
|
2358 |
|
2359 CHECKPOINT (); |
|
2360 |
|
2361 if (w) |
|
2362 { |
|
2363 uint64_t m = *mask; |
|
2364 |
|
2365 if (m) |
|
2366 { |
|
2367 __m64 vdest; |
|
2368 |
|
2369 vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); |
|
2370 store8888 (dst, vdest); |
|
2371 } |
|
2372 else |
|
2373 { |
|
2374 *dst = 0; |
|
2375 } |
|
2376 } |
|
2377 } |
|
2378 |
|
2379 _mm_empty (); |
|
2380 } |
|
2381 |
|
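/* OVER: solid source modulated by an a8 mask onto an r5g6b5 destination,
 * four pixels per 64-bit store once the destination pointer is aligned.
 */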
2382 static void |
|
2383 mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, |
|
2384 pixman_composite_info_t *info) |
|
2385 { |
|
2386 PIXMAN_COMPOSITE_ARGS (info); |
|
2387 uint32_t src, srca; |
|
2388 uint16_t *dst_line, *dst; |
|
2389 uint8_t *mask_line, *mask; |
|
2390 int dst_stride, mask_stride; |
|
2391 int32_t w; |
|
2392 __m64 vsrc, vsrca, tmp; |
|
2393 __m64 srcsrcsrcsrc; |
|
2394 |
|
2395 CHECKPOINT (); |
|
2396 |
|
2397 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
2398 |
|
2399 srca = src >> 24; |
|
2400 if (src == 0) |
|
2401 return; |
|
2402 |
|
2403 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
|
2404 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
|
2405 |
|
2406 vsrc = load8888 (&src); |
|
2407 vsrca = expand_alpha (vsrc); |
|
2408 |
|
2409 tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0); |
|
2410 srcsrcsrcsrc = expand_alpha_rev (tmp); |
|
2411 |
|
2412 while (height--) |
|
2413 { |
|
2414 dst = dst_line; |
|
2415 dst_line += dst_stride; |
|
2416 mask = mask_line; |
|
2417 mask_line += mask_stride; |
|
2418 w = width; |
|
2419 |
|
2420 CHECKPOINT (); |
|
2421 |
|
2422 while (w && (uintptr_t)dst & 7) |
|
2423 { |
|
2424 uint64_t m = *mask; |
|
2425 |
|
2426 if (m) |
|
2427 { |
|
2428 uint64_t d = *dst; |
|
2429 __m64 vd = to_m64 (d); |
|
2430 __m64 vdest = in_over ( |
|
2431 vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0)); |
|
2432 |
|
2433 vd = pack_565 (vdest, _mm_setzero_si64 (), 0); |
|
2434 *dst = to_uint64 (vd); |
|
2435 } |
|
2436 |
|
2437 w--; |
|
2438 mask++; |
|
2439 dst++; |
|
2440 } |
|
2441 |
|
2442 CHECKPOINT (); |
|
2443 |
|
2444 while (w >= 4) |
|
2445 { |
|
2446 uint64_t m0, m1, m2, m3; |
|
2447 m0 = *mask; |
|
2448 m1 = *(mask + 1); |
|
2449 m2 = *(mask + 2); |
|
2450 m3 = *(mask + 3); |
|
2451 |
|
2452 if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff) |
|
2453 { |
|
2454 *(__m64 *)dst = srcsrcsrcsrc; |
|
2455 } |
|
2456 else if (m0 | m1 | m2 | m3) |
|
2457 { |
|
2458 __m64 vdest = *(__m64 *)dst; |
|
2459 __m64 v0, v1, v2, v3; |
|
2460 __m64 vm0, vm1, vm2, vm3; |
|
2461 |
|
2462 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); |
|
2463 |
|
2464 vm0 = to_m64 (m0); |
|
2465 v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0); |
|
2466 |
|
2467 vm1 = to_m64 (m1); |
|
2468 v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1); |
|
2469 |
|
2470 vm2 = to_m64 (m2); |
|
2471 v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2); |
|
2472 |
|
2473 vm3 = to_m64 (m3); |
|
2474 v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3); |
|
2475 |
|
2476 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); |
|
2477 } |
|
2478 |
|
2479 w -= 4; |
|
2480 mask += 4; |
|
2481 dst += 4; |
|
2482 } |
|
2483 |
|
2484 CHECKPOINT (); |
|
2485 |
|
2486 while (w) |
|
2487 { |
|
2488 uint64_t m = *mask; |
|
2489 |
|
2490 if (m) |
|
2491 { |
|
2492 uint64_t d = *dst; |
|
2493 __m64 vd = to_m64 (d); |
|
2494 __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)), |
|
2495 expand565 (vd, 0)); |
|
2496 vd = pack_565 (vdest, _mm_setzero_si64 (), 0); |
|
2497 *dst = to_uint64 (vd); |
|
2498 } |
|
2499 |
|
2500 w--; |
|
2501 mask++; |
|
2502 dst++; |
|
2503 } |
|
2504 } |
|
2505 |
|
2506 _mm_empty (); |
|
2507 } |
|
2508 |
|
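/* OVER for pixbuf sources (non-premultiplied pixels that also act as the
 * mask) onto r5g6b5: fully opaque pixels are channel-swapped with
 * invert_colors () and packed directly, everything else goes through
 * over_rev_non_pre ().
 */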
2509 static void |
|
2510 mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp, |
|
2511 pixman_composite_info_t *info) |
|
2512 { |
|
2513 PIXMAN_COMPOSITE_ARGS (info); |
|
2514 uint16_t *dst_line, *dst; |
|
2515 uint32_t *src_line, *src; |
|
2516 int dst_stride, src_stride; |
|
2517 int32_t w; |
|
2518 |
|
2519 CHECKPOINT (); |
|
2520 |
|
2521 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
|
2522 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
2523 |
|
2524 #if 0 |
|
2525 /* FIXME */ |
|
2526 assert (src_image->drawable == mask_image->drawable); |
|
2527 #endif |
|
2528 |
|
2529 while (height--) |
|
2530 { |
|
2531 dst = dst_line; |
|
2532 dst_line += dst_stride; |
|
2533 src = src_line; |
|
2534 src_line += src_stride; |
|
2535 w = width; |
|
2536 |
|
2537 CHECKPOINT (); |
|
2538 |
|
2539 while (w && (uintptr_t)dst & 7) |
|
2540 { |
|
2541 __m64 vsrc = load8888 (src); |
|
2542 uint64_t d = *dst; |
|
2543 __m64 vdest = expand565 (to_m64 (d), 0); |
|
2544 |
|
2545 vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0); |
|
2546 |
|
2547 *dst = to_uint64 (vdest); |
|
2548 |
|
2549 w--; |
|
2550 dst++; |
|
2551 src++; |
|
2552 } |
|
2553 |
|
2554 CHECKPOINT (); |
|
2555 |
|
2556 while (w >= 4) |
|
2557 { |
|
2558 uint32_t s0, s1, s2, s3; |
|
2559 unsigned char a0, a1, a2, a3; |
|
2560 |
|
2561 s0 = *src; |
|
2562 s1 = *(src + 1); |
|
2563 s2 = *(src + 2); |
|
2564 s3 = *(src + 3); |
|
2565 |
|
2566 a0 = (s0 >> 24); |
|
2567 a1 = (s1 >> 24); |
|
2568 a2 = (s2 >> 24); |
|
2569 a3 = (s3 >> 24); |
|
2570 |
|
2571 if ((a0 & a1 & a2 & a3) == 0xFF) |
|
2572 { |
|
2573 __m64 v0 = invert_colors (load8888 (&s0)); |
|
2574 __m64 v1 = invert_colors (load8888 (&s1)); |
|
2575 __m64 v2 = invert_colors (load8888 (&s2)); |
|
2576 __m64 v3 = invert_colors (load8888 (&s3)); |
|
2577 |
|
2578 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); |
|
2579 } |
|
2580 else if (s0 | s1 | s2 | s3) |
|
2581 { |
|
2582 __m64 vdest = *(__m64 *)dst; |
|
2583 __m64 v0, v1, v2, v3; |
|
2584 |
|
2585 __m64 vsrc0 = load8888 (&s0); |
|
2586 __m64 vsrc1 = load8888 (&s1); |
|
2587 __m64 vsrc2 = load8888 (&s2); |
|
2588 __m64 vsrc3 = load8888 (&s3); |
|
2589 |
|
2590 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); |
|
2591 |
|
2592 v0 = over_rev_non_pre (vsrc0, v0); |
|
2593 v1 = over_rev_non_pre (vsrc1, v1); |
|
2594 v2 = over_rev_non_pre (vsrc2, v2); |
|
2595 v3 = over_rev_non_pre (vsrc3, v3); |
|
2596 |
|
2597 *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3); |
|
2598 } |
|
2599 |
|
2600 w -= 4; |
|
2601 dst += 4; |
|
2602 src += 4; |
|
2603 } |
|
2604 |
|
2605 CHECKPOINT (); |
|
2606 |
|
2607 while (w) |
|
2608 { |
|
2609 __m64 vsrc = load8888 (src); |
|
2610 uint64_t d = *dst; |
|
2611 __m64 vdest = expand565 (to_m64 (d), 0); |
|
2612 |
|
2613 vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0); |
|
2614 |
|
2615 *dst = to_uint64 (vdest); |
|
2616 |
|
2617 w--; |
|
2618 dst++; |
|
2619 src++; |
|
2620 } |
|
2621 } |
|
2622 |
|
2623 _mm_empty (); |
|
2624 } |
|
2625 |
|
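/* OVER for pixbuf sources onto an 8888 destination: opaque pixels are
 * converted with invert_colors (), the rest use over_rev_non_pre (),
 * two pixels per 64-bit store.
 */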
2626 static void |
|
2627 mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, |
|
2628 pixman_composite_info_t *info) |
|
2629 { |
|
2630 PIXMAN_COMPOSITE_ARGS (info); |
|
2631 uint32_t *dst_line, *dst; |
|
2632 uint32_t *src_line, *src; |
|
2633 int dst_stride, src_stride; |
|
2634 int32_t w; |
|
2635 |
|
2636 CHECKPOINT (); |
|
2637 |
|
2638 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
2639 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
2640 |
|
2641 #if 0 |
|
2642 /* FIXME */ |
|
2643 assert (src_image->drawable == mask_image->drawable); |
|
2644 #endif |
|
2645 |
|
2646 while (height--) |
|
2647 { |
|
2648 dst = dst_line; |
|
2649 dst_line += dst_stride; |
|
2650 src = src_line; |
|
2651 src_line += src_stride; |
|
2652 w = width; |
|
2653 |
|
2654 while (w && (uintptr_t)dst & 7) |
|
2655 { |
|
2656 __m64 s = load8888 (src); |
|
2657 __m64 d = load8888 (dst); |
|
2658 |
|
2659 store8888 (dst, over_rev_non_pre (s, d)); |
|
2660 |
|
2661 w--; |
|
2662 dst++; |
|
2663 src++; |
|
2664 } |
|
2665 |
|
2666 while (w >= 2) |
|
2667 { |
|
2668 uint32_t s0, s1; |
|
2669 unsigned char a0, a1; |
|
2670 __m64 d0, d1; |
|
2671 |
|
2672 s0 = *src; |
|
2673 s1 = *(src + 1); |
|
2674 |
|
2675 a0 = (s0 >> 24); |
|
2676 a1 = (s1 >> 24); |
|
2677 |
|
2678 if ((a0 & a1) == 0xFF) |
|
2679 { |
|
2680 d0 = invert_colors (load8888 (&s0)); |
|
2681 d1 = invert_colors (load8888 (&s1)); |
|
2682 |
|
2683 *(__m64 *)dst = pack8888 (d0, d1); |
|
2684 } |
|
2685 else if (s0 | s1) |
|
2686 { |
|
2687 __m64 vdest = *(__m64 *)dst; |
|
2688 |
|
2689 d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0)); |
|
2690 d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1)); |
|
2691 |
|
2692 *(__m64 *)dst = pack8888 (d0, d1); |
|
2693 } |
|
2694 |
|
2695 w -= 2; |
|
2696 dst += 2; |
|
2697 src += 2; |
|
2698 } |
|
2699 |
|
2700 if (w) |
|
2701 { |
|
2702 __m64 s = load8888 (src); |
|
2703 __m64 d = load8888 (dst); |
|
2704 |
|
2705 store8888 (dst, over_rev_non_pre (s, d)); |
|
2706 } |
|
2707 } |
|
2708 |
|
2709 _mm_empty (); |
|
2710 } |
|
2711 |
|
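/* Component-alpha OVER: solid source with an a8r8g8b8 mask onto an
 * r5g6b5 destination, four pixels per 64-bit load/store in the main loop.
 */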
2712 static void |
|
2713 mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, |
|
2714 pixman_composite_info_t *info) |
|
2715 { |
|
2716 PIXMAN_COMPOSITE_ARGS (info); |
|
2717 uint32_t src; |
|
2718 uint16_t *dst_line; |
|
2719 uint32_t *mask_line; |
|
2720 int dst_stride, mask_stride; |
|
2721 __m64 vsrc, vsrca; |
|
2722 |
|
2723 CHECKPOINT (); |
|
2724 |
|
2725 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
2726 |
|
2727 if (src == 0) |
|
2728 return; |
|
2729 |
|
2730 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
|
2731 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
|
2732 |
|
2733 vsrc = load8888 (&src); |
|
2734 vsrca = expand_alpha (vsrc); |
|
2735 |
|
2736 while (height--) |
|
2737 { |
|
2738 int twidth = width; |
|
2739 uint32_t *p = (uint32_t *)mask_line; |
|
2740 uint16_t *q = (uint16_t *)dst_line; |
|
2741 |
|
2742 while (twidth && ((uintptr_t)q & 7)) |
|
2743 { |
|
2744 uint32_t m = *(uint32_t *)p; |
|
2745 |
|
2746 if (m) |
|
2747 { |
|
2748 uint64_t d = *q; |
|
2749 __m64 vdest = expand565 (to_m64 (d), 0); |
|
2750 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); |
|
2751 *q = to_uint64 (vdest); |
|
2752 } |
|
2753 |
|
2754 twidth--; |
|
2755 p++; |
|
2756 q++; |
|
2757 } |
|
2758 |
|
2759 while (twidth >= 4) |
|
2760 { |
|
2761 uint32_t m0, m1, m2, m3; |
|
2762 |
|
2763 m0 = *p; |
|
2764 m1 = *(p + 1); |
|
2765 m2 = *(p + 2); |
|
2766 m3 = *(p + 3); |
|
2767 |
|
2768 if ((m0 | m1 | m2 | m3)) |
|
2769 { |
|
2770 __m64 vdest = *(__m64 *)q; |
|
2771 __m64 v0, v1, v2, v3; |
|
2772 |
|
2773 expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); |
|
2774 |
|
2775 v0 = in_over (vsrc, vsrca, load8888 (&m0), v0); |
|
2776 v1 = in_over (vsrc, vsrca, load8888 (&m1), v1); |
|
2777 v2 = in_over (vsrc, vsrca, load8888 (&m2), v2); |
|
2778 v3 = in_over (vsrc, vsrca, load8888 (&m3), v3); |
|
2779 |
|
2780 *(__m64 *)q = pack_4x565 (v0, v1, v2, v3); |
|
2781 } |
|
2782 twidth -= 4; |
|
2783 p += 4; |
|
2784 q += 4; |
|
2785 } |
|
2786 |
|
2787 while (twidth) |
|
2788 { |
|
2789 uint32_t m; |
|
2790 |
|
2791 m = *(uint32_t *)p; |
|
2792 if (m) |
|
2793 { |
|
2794 uint64_t d = *q; |
|
2795 __m64 vdest = expand565 (to_m64 (d), 0); |
|
2796 vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0); |
|
2797 *q = to_uint64 (vdest); |
|
2798 } |
|
2799 |
|
2800 twidth--; |
|
2801 p++; |
|
2802 q++; |
|
2803 } |
|
2804 |
|
2805 mask_line += mask_stride; |
|
2806 dst_line += dst_stride; |
|
2807 } |
|
2808 |
|
2809 _mm_empty (); |
|
2810 } |
|
2811 |
|
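/* IN: multiply an a8 destination by the solid source alpha and the a8
 * mask, four mask/destination bytes at a time in the vector loop.
 */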
2812 static void |
|
2813 mmx_composite_in_n_8_8 (pixman_implementation_t *imp, |
|
2814 pixman_composite_info_t *info) |
|
2815 { |
|
2816 PIXMAN_COMPOSITE_ARGS (info); |
|
2817 uint8_t *dst_line, *dst; |
|
2818 uint8_t *mask_line, *mask; |
|
2819 int dst_stride, mask_stride; |
|
2820 int32_t w; |
|
2821 uint32_t src; |
|
2822 uint8_t sa; |
|
2823 __m64 vsrc, vsrca; |
|
2824 |
|
2825 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
|
2826 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
|
2827 |
|
2828 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
2829 |
|
2830 sa = src >> 24; |
|
2831 |
|
2832 vsrc = load8888 (&src); |
|
2833 vsrca = expand_alpha (vsrc); |
|
2834 |
|
2835 while (height--) |
|
2836 { |
|
2837 dst = dst_line; |
|
2838 dst_line += dst_stride; |
|
2839 mask = mask_line; |
|
2840 mask_line += mask_stride; |
|
2841 w = width; |
|
2842 |
|
2843 while (w && (uintptr_t)dst & 7) |
|
2844 { |
|
2845 uint16_t tmp; |
|
2846 uint8_t a; |
|
2847 uint32_t m, d; |
|
2848 |
|
2849 a = *mask++; |
|
2850 d = *dst; |
|
2851 |
|
2852 m = MUL_UN8 (sa, a, tmp); |
|
2853 d = MUL_UN8 (m, d, tmp); |
|
2854 |
|
2855 *dst++ = d; |
|
2856 w--; |
|
2857 } |
|
2858 |
|
2859 while (w >= 4) |
|
2860 { |
|
2861 __m64 vmask; |
|
2862 __m64 vdest; |
|
2863 |
|
2864 vmask = load8888u ((uint32_t *)mask); |
|
2865 vdest = load8888 ((uint32_t *)dst); |
|
2866 |
|
2867 store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest)); |
|
2868 |
|
2869 dst += 4; |
|
2870 mask += 4; |
|
2871 w -= 4; |
|
2872 } |
|
2873 |
|
2874 while (w--) |
|
2875 { |
|
2876 uint16_t tmp; |
|
2877 uint8_t a; |
|
2878 uint32_t m, d; |
|
2879 |
|
2880 a = *mask++; |
|
2881 d = *dst; |
|
2882 |
|
2883 m = MUL_UN8 (sa, a, tmp); |
|
2884 d = MUL_UN8 (m, d, tmp); |
|
2885 |
|
2886 *dst++ = d; |
|
2887 } |
|
2888 } |
|
2889 |
|
2890 _mm_empty (); |
|
2891 } |
|
2892 |
|
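/* IN: multiply a8 destination pixels by a8 source pixels, four bytes per
 * iteration in the vector loop.
 */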
2893 static void |
|
2894 mmx_composite_in_8_8 (pixman_implementation_t *imp, |
|
2895 pixman_composite_info_t *info) |
|
2896 { |
|
2897 PIXMAN_COMPOSITE_ARGS (info); |
|
2898 uint8_t *dst_line, *dst; |
|
2899 uint8_t *src_line, *src; |
|
2900 int src_stride, dst_stride; |
|
2901 int32_t w; |
|
2902 |
|
2903 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
|
2904 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); |
|
2905 |
|
2906 while (height--) |
|
2907 { |
|
2908 dst = dst_line; |
|
2909 dst_line += dst_stride; |
|
2910 src = src_line; |
|
2911 src_line += src_stride; |
|
2912 w = width; |
|
2913 |
|
2914 while (w && (uintptr_t)dst & 3) |
|
2915 { |
|
2916 uint8_t s, d; |
|
2917 uint16_t tmp; |
|
2918 |
|
2919 s = *src; |
|
2920 d = *dst; |
|
2921 |
|
2922 *dst = MUL_UN8 (s, d, tmp); |
|
2923 |
|
2924 src++; |
|
2925 dst++; |
|
2926 w--; |
|
2927 } |
|
2928 |
|
2929 while (w >= 4) |
|
2930 { |
|
2931 uint32_t *s = (uint32_t *)src; |
|
2932 uint32_t *d = (uint32_t *)dst; |
|
2933 |
|
2934 store8888 (d, in (load8888u (s), load8888 (d))); |
|
2935 |
|
2936 w -= 4; |
|
2937 dst += 4; |
|
2938 src += 4; |
|
2939 } |
|
2940 |
|
2941 while (w--) |
|
2942 { |
|
2943 uint8_t s, d; |
|
2944 uint16_t tmp; |
|
2945 |
|
2946 s = *src; |
|
2947 d = *dst; |
|
2948 |
|
2949 *dst = MUL_UN8 (s, d, tmp); |
|
2950 |
|
2951 src++; |
|
2952 dst++; |
|
2953 } |
|
2954 } |
|
2955 |
|
2956 _mm_empty (); |
|
2957 } |
|
2958 |
|
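/* ADD: saturating add of (solid source alpha IN a8 mask) to an a8
 * destination.
 */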
2959 static void |
|
2960 mmx_composite_add_n_8_8 (pixman_implementation_t *imp, |
|
2961 pixman_composite_info_t *info) |
|
2962 { |
|
2963 PIXMAN_COMPOSITE_ARGS (info); |
|
2964 uint8_t *dst_line, *dst; |
|
2965 uint8_t *mask_line, *mask; |
|
2966 int dst_stride, mask_stride; |
|
2967 int32_t w; |
|
2968 uint32_t src; |
|
2969 uint8_t sa; |
|
2970 __m64 vsrc, vsrca; |
|
2971 |
|
2972 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
|
2973 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
|
2974 |
|
2975 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
2976 |
|
2977 sa = src >> 24; |
|
2978 |
|
2979 if (src == 0) |
|
2980 return; |
|
2981 |
|
2982 vsrc = load8888 (&src); |
|
2983 vsrca = expand_alpha (vsrc); |
|
2984 |
|
2985 while (height--) |
|
2986 { |
|
2987 dst = dst_line; |
|
2988 dst_line += dst_stride; |
|
2989 mask = mask_line; |
|
2990 mask_line += mask_stride; |
|
2991 w = width; |
|
2992 |
|
2993 while (w && (uintptr_t)dst & 3) |
|
2994 { |
|
2995 uint16_t tmp; |
|
2996 uint16_t a; |
|
2997 uint32_t m, d; |
|
2998 uint32_t r; |
|
2999 |
|
3000 a = *mask++; |
|
3001 d = *dst; |
|
3002 |
|
3003 m = MUL_UN8 (sa, a, tmp); |
|
3004 r = ADD_UN8 (m, d, tmp); |
|
3005 |
|
3006 *dst++ = r; |
|
3007 w--; |
|
3008 } |
|
3009 |
|
3010 while (w >= 4) |
|
3011 { |
|
3012 __m64 vmask; |
|
3013 __m64 vdest; |
|
3014 |
|
3015 vmask = load8888u ((uint32_t *)mask); |
|
3016 vdest = load8888 ((uint32_t *)dst); |
|
3017 |
|
3018 store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest)); |
|
3019 |
|
3020 dst += 4; |
|
3021 mask += 4; |
|
3022 w -= 4; |
|
3023 } |
|
3024 |
|
3025 while (w--) |
|
3026 { |
|
3027 uint16_t tmp; |
|
3028 uint16_t a; |
|
3029 uint32_t m, d; |
|
3030 uint32_t r; |
|
3031 |
|
3032 a = *mask++; |
|
3033 d = *dst; |
|
3034 |
|
3035 m = MUL_UN8 (sa, a, tmp); |
|
3036 r = ADD_UN8 (m, d, tmp); |
|
3037 |
|
3038 *dst++ = r; |
|
3039 } |
|
3040 } |
|
3041 |
|
3042 _mm_empty (); |
|
3043 } |
|
3044 |
|
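/* ADD: saturating add of an a8 source onto an a8 destination, eight
 * bytes per MMX addition once the destination is 8-byte aligned.
 */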
3045 static void |
|
3046 mmx_composite_add_8_8 (pixman_implementation_t *imp, |
|
3047 pixman_composite_info_t *info) |
|
3048 { |
|
3049 PIXMAN_COMPOSITE_ARGS (info); |
|
3050 uint8_t *dst_line, *dst; |
|
3051 uint8_t *src_line, *src; |
|
3052 int dst_stride, src_stride; |
|
3053 int32_t w; |
|
3054 uint8_t s, d; |
|
3055 uint16_t t; |
|
3056 |
|
3057 CHECKPOINT (); |
|
3058 |
|
3059 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); |
|
3060 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
|
3061 |
|
3062 while (height--) |
|
3063 { |
|
3064 dst = dst_line; |
|
3065 dst_line += dst_stride; |
|
3066 src = src_line; |
|
3067 src_line += src_stride; |
|
3068 w = width; |
|
3069 |
|
3070 while (w && (uintptr_t)dst & 7) |
|
3071 { |
|
3072 s = *src; |
|
3073 d = *dst; |
|
3074 t = d + s; |
|
3075 s = t | (0 - (t >> 8)); |
|
3076 *dst = s; |
|
3077 |
|
3078 dst++; |
|
3079 src++; |
|
3080 w--; |
|
3081 } |
|
3082 |
|
3083 while (w >= 8) |
|
3084 { |
|
3085 *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst); |
|
3086 dst += 8; |
|
3087 src += 8; |
|
3088 w -= 8; |
|
3089 } |
|
3090 |
|
3091 while (w) |
|
3092 { |
|
3093 s = *src; |
|
3094 d = *dst; |
|
3095 t = d + s; |
|
3096 s = t | (0 - (t >> 8)); |
|
3097 *dst = s; |
|
3098 |
|
3099 dst++; |
|
3100 src++; |
|
3101 w--; |
|
3102 } |
|
3103 } |
|
3104 |
|
3105 _mm_empty (); |
|
3106 } |
|
3107 |
|
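/* ADD for r5g6b5: pixels are expanded to 8888, added with unsigned
 * saturation and packed back to 565, four pixels at a time in the
 * vector loop.
 */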
3108 static void |
|
3109 mmx_composite_add_0565_0565 (pixman_implementation_t *imp, |
|
3110 pixman_composite_info_t *info) |
|
3111 { |
|
3112 PIXMAN_COMPOSITE_ARGS (info); |
|
3113 uint16_t *dst_line, *dst; |
|
3114 uint32_t d; |
|
3115 uint16_t *src_line, *src; |
|
3116 uint32_t s; |
|
3117 int dst_stride, src_stride; |
|
3118 int32_t w; |
|
3119 |
|
3120 CHECKPOINT (); |
|
3121 |
|
3122 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1); |
|
3123 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
|
3124 |
|
3125 while (height--) |
|
3126 { |
|
3127 dst = dst_line; |
|
3128 dst_line += dst_stride; |
|
3129 src = src_line; |
|
3130 src_line += src_stride; |
|
3131 w = width; |
|
3132 |
|
3133 while (w && (uintptr_t)dst & 7) |
|
3134 { |
|
3135 s = *src++; |
|
3136 if (s) |
|
3137 { |
|
3138 d = *dst; |
|
3139 s = convert_0565_to_8888 (s); |
|
3140 if (d) |
|
3141 { |
|
3142 d = convert_0565_to_8888 (d); |
|
3143 UN8x4_ADD_UN8x4 (s, d); |
|
3144 } |
|
3145 *dst = convert_8888_to_0565 (s); |
|
3146 } |
|
3147 dst++; |
|
3148 w--; |
|
3149 } |
|
3150 |
|
3151 while (w >= 4) |
|
3152 { |
|
3153 __m64 vdest = *(__m64 *)dst; |
|
3154 __m64 vsrc = ldq_u ((__m64 *)src); |
|
3155 __m64 vd0, vd1; |
|
3156 __m64 vs0, vs1; |
|
3157 |
|
3158 expand_4xpacked565 (vdest, &vd0, &vd1, 0); |
|
3159 expand_4xpacked565 (vsrc, &vs0, &vs1, 0); |
|
3160 |
|
3161 vd0 = _mm_adds_pu8 (vd0, vs0); |
|
3162 vd1 = _mm_adds_pu8 (vd1, vs1); |
|
3163 |
|
3164 *(__m64 *)dst = pack_4xpacked565 (vd0, vd1); |
|
3165 |
|
3166 dst += 4; |
|
3167 src += 4; |
|
3168 w -= 4; |
|
3169 } |
|
3170 |
|
3171 while (w--) |
|
3172 { |
|
3173 s = *src++; |
|
3174 if (s) |
|
3175 { |
|
3176 d = *dst; |
|
3177 s = convert_0565_to_8888 (s); |
|
3178 if (d) |
|
3179 { |
|
3180 d = convert_0565_to_8888 (d); |
|
3181 UN8x4_ADD_UN8x4 (s, d); |
|
3182 } |
|
3183 *dst = convert_8888_to_0565 (s); |
|
3184 } |
|
3185 dst++; |
|
3186 } |
|
3187 } |
|
3188 |
|
3189 _mm_empty (); |
|
3190 } |
|
3191 |
|
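/* ADD: saturating add of 8888 source onto 8888 destination, two pixels
 * per 64-bit operation.
 */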
3192 static void |
|
3193 mmx_composite_add_8888_8888 (pixman_implementation_t *imp, |
|
3194 pixman_composite_info_t *info) |
|
3195 { |
|
3196 PIXMAN_COMPOSITE_ARGS (info); |
|
3197 uint32_t *dst_line, *dst; |
|
3198 uint32_t *src_line, *src; |
|
3199 int dst_stride, src_stride; |
|
3200 int32_t w; |
|
3201 |
|
3202 CHECKPOINT (); |
|
3203 |
|
3204 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
3205 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
3206 |
|
3207 while (height--) |
|
3208 { |
|
3209 dst = dst_line; |
|
3210 dst_line += dst_stride; |
|
3211 src = src_line; |
|
3212 src_line += src_stride; |
|
3213 w = width; |
|
3214 |
|
3215 while (w && (uintptr_t)dst & 7) |
|
3216 { |
|
3217 store (dst, _mm_adds_pu8 (load ((const uint32_t *)src), |
|
3218 load ((const uint32_t *)dst))); |
|
3219 dst++; |
|
3220 src++; |
|
3221 w--; |
|
3222 } |
|
3223 |
|
3224 while (w >= 2) |
|
3225 { |
|
3226 *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst); |
|
3227 dst += 2; |
|
3228 src += 2; |
|
3229 w -= 2; |
|
3230 } |
|
3231 |
|
3232 if (w) |
|
3233 { |
|
3234 store (dst, _mm_adds_pu8 (load ((const uint32_t *)src), |
|
3235 load ((const uint32_t *)dst))); |
|
3236 |
|
3237 } |
|
3238 } |
|
3239 |
|
3240 _mm_empty (); |
|
3241 } |
|
3242 |
|
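/* Plain copy (blt) for 16 and 32 bpp images with matching depths. Each
 * row is aligned up to an 8-byte boundary and then copied 64 bytes per
 * iteration through the MMX registers; returns FALSE when the depths
 * differ or are unsupported.
 */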
3243 static pixman_bool_t |
|
3244 mmx_blt (pixman_implementation_t *imp, |
|
3245 uint32_t * src_bits, |
|
3246 uint32_t * dst_bits, |
|
3247 int src_stride, |
|
3248 int dst_stride, |
|
3249 int src_bpp, |
|
3250 int dst_bpp, |
|
3251 int src_x, |
|
3252 int src_y, |
|
3253 int dest_x, |
|
3254 int dest_y, |
|
3255 int width, |
|
3256 int height) |
|
3257 { |
|
3258 uint8_t * src_bytes; |
|
3259 uint8_t * dst_bytes; |
|
3260 int byte_width; |
|
3261 |
|
3262 if (src_bpp != dst_bpp) |
|
3263 return FALSE; |
|
3264 |
|
3265 if (src_bpp == 16) |
|
3266 { |
|
3267 src_stride = src_stride * (int) sizeof (uint32_t) / 2; |
|
3268 dst_stride = dst_stride * (int) sizeof (uint32_t) / 2; |
|
3269 src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x)); |
|
3270 dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); |
|
3271 byte_width = 2 * width; |
|
3272 src_stride *= 2; |
|
3273 dst_stride *= 2; |
|
3274 } |
|
3275 else if (src_bpp == 32) |
|
3276 { |
|
3277 src_stride = src_stride * (int) sizeof (uint32_t) / 4; |
|
3278 dst_stride = dst_stride * (int) sizeof (uint32_t) / 4; |
|
3279 src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x)); |
|
3280 dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x)); |
|
3281 byte_width = 4 * width; |
|
3282 src_stride *= 4; |
|
3283 dst_stride *= 4; |
|
3284 } |
|
3285 else |
|
3286 { |
|
3287 return FALSE; |
|
3288 } |
|
3289 |
|
3290 while (height--) |
|
3291 { |
|
3292 int w; |
|
3293 uint8_t *s = src_bytes; |
|
3294 uint8_t *d = dst_bytes; |
|
3295 src_bytes += src_stride; |
|
3296 dst_bytes += dst_stride; |
|
3297 w = byte_width; |
|
3298 |
|
3299 if (w >= 1 && ((uintptr_t)d & 1)) |
|
3300 { |
|
3301 *(uint8_t *)d = *(uint8_t *)s; |
|
3302 w -= 1; |
|
3303 s += 1; |
|
3304 d += 1; |
|
3305 } |
|
3306 |
|
3307 if (w >= 2 && ((uintptr_t)d & 3)) |
|
3308 { |
|
3309 *(uint16_t *)d = *(uint16_t *)s; |
|
3310 w -= 2; |
|
3311 s += 2; |
|
3312 d += 2; |
|
3313 } |
|
3314 |
|
3315 while (w >= 4 && ((uintptr_t)d & 7)) |
|
3316 { |
|
3317 *(uint32_t *)d = ldl_u ((uint32_t *)s); |
|
3318 |
|
3319 w -= 4; |
|
3320 s += 4; |
|
3321 d += 4; |
|
3322 } |
|
3323 |
|
3324 while (w >= 64) |
|
3325 { |
|
3326 #if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX |
|
3327 __asm__ ( |
|
3328 "movq (%1), %%mm0\n" |
|
3329 "movq 8(%1), %%mm1\n" |
|
3330 "movq 16(%1), %%mm2\n" |
|
3331 "movq 24(%1), %%mm3\n" |
|
3332 "movq 32(%1), %%mm4\n" |
|
3333 "movq 40(%1), %%mm5\n" |
|
3334 "movq 48(%1), %%mm6\n" |
|
3335 "movq 56(%1), %%mm7\n" |
|
3336 |
|
3337 "movq %%mm0, (%0)\n" |
|
3338 "movq %%mm1, 8(%0)\n" |
|
3339 "movq %%mm2, 16(%0)\n" |
|
3340 "movq %%mm3, 24(%0)\n" |
|
3341 "movq %%mm4, 32(%0)\n" |
|
3342 "movq %%mm5, 40(%0)\n" |
|
3343 "movq %%mm6, 48(%0)\n" |
|
3344 "movq %%mm7, 56(%0)\n" |
|
3345 : |
|
3346 : "r" (d), "r" (s) |
|
3347 : "memory", |
|
3348 "%mm0", "%mm1", "%mm2", "%mm3", |
|
3349 "%mm4", "%mm5", "%mm6", "%mm7"); |
|
3350 #else |
|
3351 __m64 v0 = ldq_u ((__m64 *)(s + 0)); |
|
3352 __m64 v1 = ldq_u ((__m64 *)(s + 8)); |
|
3353 __m64 v2 = ldq_u ((__m64 *)(s + 16)); |
|
3354 __m64 v3 = ldq_u ((__m64 *)(s + 24)); |
|
3355 __m64 v4 = ldq_u ((__m64 *)(s + 32)); |
|
3356 __m64 v5 = ldq_u ((__m64 *)(s + 40)); |
|
3357 __m64 v6 = ldq_u ((__m64 *)(s + 48)); |
|
3358 __m64 v7 = ldq_u ((__m64 *)(s + 56)); |
|
3359 *(__m64 *)(d + 0) = v0; |
|
3360 *(__m64 *)(d + 8) = v1; |
|
3361 *(__m64 *)(d + 16) = v2; |
|
3362 *(__m64 *)(d + 24) = v3; |
|
3363 *(__m64 *)(d + 32) = v4; |
|
3364 *(__m64 *)(d + 40) = v5; |
|
3365 *(__m64 *)(d + 48) = v6; |
|
3366 *(__m64 *)(d + 56) = v7; |
|
3367 #endif |
|
3368 |
|
3369 w -= 64; |
|
3370 s += 64; |
|
3371 d += 64; |
|
3372 } |
|
3373 while (w >= 4) |
|
3374 { |
|
3375 *(uint32_t *)d = ldl_u ((uint32_t *)s); |
|
3376 |
|
3377 w -= 4; |
|
3378 s += 4; |
|
3379 d += 4; |
|
3380 } |
|
3381 if (w >= 2) |
|
3382 { |
|
3383 *(uint16_t *)d = *(uint16_t *)s; |
|
3384 w -= 2; |
|
3385 s += 2; |
|
3386 d += 2; |
|
3387 } |
|
3388 } |
|
3389 |
|
3390 _mm_empty (); |
|
3391 |
|
3392 return TRUE; |
|
3393 } |
|
3394 |
|
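/* SRC copy of a rectangle, implemented on top of mmx_blt (). */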
3395 static void |
|
3396 mmx_composite_copy_area (pixman_implementation_t *imp, |
|
3397 pixman_composite_info_t *info) |
|
3398 { |
|
3399 PIXMAN_COMPOSITE_ARGS (info); |
|
3400 |
|
3401 mmx_blt (imp, src_image->bits.bits, |
|
3402 dest_image->bits.bits, |
|
3403 src_image->bits.rowstride, |
|
3404 dest_image->bits.rowstride, |
|
3405 PIXMAN_FORMAT_BPP (src_image->bits.format), |
|
3406 PIXMAN_FORMAT_BPP (dest_image->bits.format), |
|
3407 src_x, src_y, dest_x, dest_y, width, height); |
|
3408 } |
|
3409 |
|
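/* OVER: x888 source (alpha forced to 0xff) through an a8 mask onto an
 * 8888 destination, one pixel at a time.
 */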
3410 static void |
|
3411 mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp, |
|
3412 pixman_composite_info_t *info) |
|
3413 { |
|
3414 PIXMAN_COMPOSITE_ARGS (info); |
|
3415 uint32_t *src, *src_line; |
|
3416 uint32_t *dst, *dst_line; |
|
3417 uint8_t *mask, *mask_line; |
|
3418 int src_stride, mask_stride, dst_stride; |
|
3419 int32_t w; |
|
3420 |
|
3421 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
3422 PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
|
3423 PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
|
3424 |
|
3425 while (height--) |
|
3426 { |
|
3427 src = src_line; |
|
3428 src_line += src_stride; |
|
3429 dst = dst_line; |
|
3430 dst_line += dst_stride; |
|
3431 mask = mask_line; |
|
3432 mask_line += mask_stride; |
|
3433 |
|
3434 w = width; |
|
3435 |
|
3436 while (w--) |
|
3437 { |
|
3438 uint64_t m = *mask; |
|
3439 |
|
3440 if (m) |
|
3441 { |
|
3442 uint32_t ssrc = *src | 0xff000000; |
|
3443 __m64 s = load8888 (&ssrc); |
|
3444 |
|
3445 if (m == 0xff) |
|
3446 { |
|
3447 store8888 (dst, s); |
|
3448 } |
|
3449 else |
|
3450 { |
|
3451 __m64 sa = expand_alpha (s); |
|
3452 __m64 vm = expand_alpha_rev (to_m64 (m)); |
|
3453 __m64 vdest = in_over (s, sa, vm, load8888 (dst)); |
|
3454 |
|
3455 store8888 (dst, vdest); |
|
3456 } |
|
3457 } |
|
3458 |
|
3459 mask++; |
|
3460 dst++; |
|
3461 src++; |
|
3462 } |
|
3463 } |
|
3464 |
|
3465 _mm_empty (); |
|
3466 } |
|
3467 |
|
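/* OVER_REVERSE with a solid source: the existing destination is
 * composited over the source color, so the color only shows through
 * where the destination is not fully opaque.
 */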
3468 static void |
|
3469 mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp, |
|
3470 pixman_composite_info_t *info) |
|
3471 { |
|
3472 PIXMAN_COMPOSITE_ARGS (info); |
|
3473 uint32_t src; |
|
3474 uint32_t *dst_line, *dst; |
|
3475 int32_t w; |
|
3476 int dst_stride; |
|
3477 __m64 vsrc; |
|
3478 |
|
3479 CHECKPOINT (); |
|
3480 |
|
3481 src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
|
3482 |
|
3483 if (src == 0) |
|
3484 return; |
|
3485 |
|
3486 PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
|
3487 |
|
3488 vsrc = load8888 (&src); |
|
3489 |
|
3490 while (height--) |
|
3491 { |
|
3492 dst = dst_line; |
|
3493 dst_line += dst_stride; |
|
3494 w = width; |
|
3495 |
|
3496 CHECKPOINT (); |
|
3497 |
|
3498 while (w && (uintptr_t)dst & 7) |
|
3499 { |
|
3500 __m64 vdest = load8888 (dst); |
|
3501 |
|
3502 store8888 (dst, over (vdest, expand_alpha (vdest), vsrc)); |
|
3503 |
|
3504 w--; |
|
3505 dst++; |
|
3506 } |
|
3507 |
|
3508 while (w >= 2) |
|
3509 { |
|
3510 __m64 vdest = *(__m64 *)dst; |
|
3511 __m64 dest0 = expand8888 (vdest, 0); |
|
3512 __m64 dest1 = expand8888 (vdest, 1); |
|
3513 |
|
3514 |
|
3515 dest0 = over (dest0, expand_alpha (dest0), vsrc); |
|
3516 dest1 = over (dest1, expand_alpha (dest1), vsrc); |
|
3517 |
|
3518 *(__m64 *)dst = pack8888 (dest0, dest1); |
|
3519 |
|
3520 dst += 2; |
|
3521 w -= 2; |
|
3522 } |
|
3523 |
|
3524 CHECKPOINT (); |
|
3525 |
|
3526 if (w) |
|
3527 { |
|
3528 __m64 vdest = load8888 (dst); |
|
3529 |
|
3530 store8888 (dst, over (vdest, expand_alpha (vdest), vsrc)); |
|
3531 } |
|
3532 } |
|
3533 |
|
3534 _mm_empty (); |
|
3535 } |
|
3536 |
|
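/* Bilinear scaling helpers. BILINEAR_DECLARE_VARIABLES sets up the
 * vertical weights (wt/wb) and the horizontal position/step (vx/unit_x)
 * as packed 16-bit vectors; BILINEAR_INTERPOLATE_ONE_PIXEL fetches a
 * 2x2 block from src_top/src_bottom, interpolates vertically and then
 * horizontally, packs the result into one a8r8g8b8 pixel and advances
 * vx by unit_x.
 */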
3537 #define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS)) |
|
3538 #define BMSK (BSHIFT - 1) |
|
3539 |
|
3540 #define BILINEAR_DECLARE_VARIABLES \ |
|
3541 const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt); \ |
|
3542 const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb); \ |
|
3543 const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT); \ |
|
3544 const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1); \ |
|
3545 const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK); \ |
|
3546 const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x); \ |
|
3547 const __m64 mm_zero = _mm_setzero_si64 (); \ |
|
3548 __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx) |
|
3549 |
|
3550 #define BILINEAR_INTERPOLATE_ONE_PIXEL(pix) \ |
|
3551 do { \ |
|
3552 /* fetch 2x2 pixel block into 2 mmx registers */ \ |
|
3553 __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]); \ |
|
3554 __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]); \ |
|
3555 /* vertical interpolation */ \ |
|
3556 __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt); \ |
|
3557 __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt); \ |
|
3558 __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb); \ |
|
3559 __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb); \ |
|
3560 __m64 hi = _mm_add_pi16 (t_hi, b_hi); \ |
|
3561 __m64 lo = _mm_add_pi16 (t_lo, b_lo); \ |
|
3562 vx += unit_x; \ |
|
3563 if (BILINEAR_INTERPOLATION_BITS < 8) \ |
|
3564 { \ |
|
3565 /* calculate horizontal weights */ \ |
|
3566 __m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7, \ |
|
3567 _mm_srli_pi16 (mm_x, \ |
|
3568 16 - BILINEAR_INTERPOLATION_BITS))); \ |
|
3569 /* horizontal interpolation */ \ |
|
3570 __m64 p = _mm_unpacklo_pi16 (lo, hi); \ |
|
3571 __m64 q = _mm_unpackhi_pi16 (lo, hi); \ |
|
3572 lo = _mm_madd_pi16 (p, mm_wh); \ |
|
3573 hi = _mm_madd_pi16 (q, mm_wh); \ |
|
3574 } \ |
|
3575 else \ |
|
3576 { \ |
|
3577 /* calculate horizontal weights */ \ |
|
3578 __m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x, \ |
|
3579 16 - BILINEAR_INTERPOLATION_BITS)); \ |
|
3580 __m64 mm_wh_hi = _mm_srli_pi16 (mm_x, \ |
|
3581 16 - BILINEAR_INTERPOLATION_BITS); \ |
|
3582 /* horizontal interpolation */ \ |
|
3583 __m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo); \ |
|
3584 __m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi); \ |
|
3585 __m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo); \ |
|
3586 __m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi); \ |
|
3587 lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo), \ |
|
3588 _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi)); \ |
|
3589 hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo), \ |
|
3590 _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi)); \ |
|
3591 } \ |
|
3592 mm_x = _mm_add_pi16 (mm_x, mm_ux); \ |
|
3593 /* shift and pack the result */ \ |
|
3594 hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2); \ |
|
3595 lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2); \ |
|
3596 lo = _mm_packs_pi32 (lo, hi); \ |
|
3597 lo = _mm_packs_pu16 (lo, lo); \ |
|
3598 pix = lo; \ |
|
3599 } while (0) |
|
3600 |
|
3601 #define BILINEAR_SKIP_ONE_PIXEL() \ |
|
3602 do { \ |
|
3603 vx += unit_x; \ |
|
3604 mm_x = _mm_add_pi16 (mm_x, mm_ux); \ |
|
3605 } while(0) |
|
3606 |
|
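/* Bilinear SRC scanline: every destination pixel is replaced by the
 * interpolated source pixel. The FAST_BILINEAR_MAINLOOP_COMMON
 * expansions below generate the COVER/PAD/NONE/NORMAL repeat variants.
 */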
3607 static force_inline void |
|
3608 scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t * dst, |
|
3609 const uint32_t * mask, |
|
3610 const uint32_t * src_top, |
|
3611 const uint32_t * src_bottom, |
|
3612 int32_t w, |
|
3613 int wt, |
|
3614 int wb, |
|
3615 pixman_fixed_t vx, |
|
3616 pixman_fixed_t unit_x, |
|
3617 pixman_fixed_t max_vx, |
|
3618 pixman_bool_t zero_src) |
|
3619 { |
|
3620 BILINEAR_DECLARE_VARIABLES; |
|
3621 __m64 pix; |
|
3622 |
|
3623 while (w--) |
|
3624 { |
|
3625 BILINEAR_INTERPOLATE_ONE_PIXEL (pix); |
|
3626 store (dst, pix); |
|
3627 dst++; |
|
3628 } |
|
3629 |
|
3630 _mm_empty (); |
|
3631 } |
|
3632 |
|
3633 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC, |
|
3634 scaled_bilinear_scanline_mmx_8888_8888_SRC, |
|
3635 uint32_t, uint32_t, uint32_t, |
|
3636 COVER, FLAG_NONE) |
|
3637 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC, |
|
3638 scaled_bilinear_scanline_mmx_8888_8888_SRC, |
|
3639 uint32_t, uint32_t, uint32_t, |
|
3640 PAD, FLAG_NONE) |
|
3641 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC, |
|
3642 scaled_bilinear_scanline_mmx_8888_8888_SRC, |
|
3643 uint32_t, uint32_t, uint32_t, |
|
3644 NONE, FLAG_NONE) |
|
3645 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC, |
|
3646 scaled_bilinear_scanline_mmx_8888_8888_SRC, |
|
3647 uint32_t, uint32_t, uint32_t, |
|
3648 NORMAL, FLAG_NONE) |
|
3649 |
|
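/* Bilinear OVER scanline: fully transparent interpolated pixels are
 * skipped, everything else is blended over the destination.
 */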
3650 static force_inline void |
|
3651 scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t * dst, |
|
3652 const uint32_t * mask, |
|
3653 const uint32_t * src_top, |
|
3654 const uint32_t * src_bottom, |
|
3655 int32_t w, |
|
3656 int wt, |
|
3657 int wb, |
|
3658 pixman_fixed_t vx, |
|
3659 pixman_fixed_t unit_x, |
|
3660 pixman_fixed_t max_vx, |
|
3661 pixman_bool_t zero_src) |
|
3662 { |
|
3663 BILINEAR_DECLARE_VARIABLES; |
|
3664 __m64 pix1, pix2; |
|
3665 |
|
3666 while (w) |
|
3667 { |
|
3668 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
|
3669 |
|
3670 if (!is_zero (pix1)) |
|
3671 { |
|
3672 pix2 = load (dst); |
|
3673 store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2)); |
|
3674 } |
|
3675 |
|
3676 w--; |
|
3677 dst++; |
|
3678 } |
|
3679 |
|
3680 _mm_empty (); |
|
3681 } |
|
3682 |
|
3683 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER, |
|
3684 scaled_bilinear_scanline_mmx_8888_8888_OVER, |
|
3685 uint32_t, uint32_t, uint32_t, |
|
3686 COVER, FLAG_NONE) |
|
3687 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER, |
|
3688 scaled_bilinear_scanline_mmx_8888_8888_OVER, |
|
3689 uint32_t, uint32_t, uint32_t, |
|
3690 PAD, FLAG_NONE) |
|
3691 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER, |
|
3692 scaled_bilinear_scanline_mmx_8888_8888_OVER, |
|
3693 uint32_t, uint32_t, uint32_t, |
|
3694 NONE, FLAG_NONE) |
|
3695 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER, |
|
3696 scaled_bilinear_scanline_mmx_8888_8888_OVER, |
|
3697 uint32_t, uint32_t, uint32_t, |
|
3698 NORMAL, FLAG_NONE) |
|
3699 |
|
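/* Bilinear OVER with an a8 mask: interpolation is skipped for zero mask
 * bytes, and an opaque mask combined with an opaque interpolated pixel
 * is stored directly.
 */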
3700 static force_inline void |
|
3701 scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t * dst, |
|
3702 const uint8_t * mask, |
|
3703 const uint32_t * src_top, |
|
3704 const uint32_t * src_bottom, |
|
3705 int32_t w, |
|
3706 int wt, |
|
3707 int wb, |
|
3708 pixman_fixed_t vx, |
|
3709 pixman_fixed_t unit_x, |
|
3710 pixman_fixed_t max_vx, |
|
3711 pixman_bool_t zero_src) |
|
3712 { |
|
3713 BILINEAR_DECLARE_VARIABLES; |
|
3714 __m64 pix1, pix2; |
|
3715 uint32_t m; |
|
3716 |
|
3717 while (w) |
|
3718 { |
|
3719 m = (uint32_t) *mask++; |
|
3720 |
|
3721 if (m) |
|
3722 { |
|
3723 BILINEAR_INTERPOLATE_ONE_PIXEL (pix1); |
|
3724 |
|
3725 if (m == 0xff && is_opaque (pix1)) |
|
3726 { |
|
3727 store (dst, pix1); |
|
3728 } |
|
3729 else |
|
3730 { |
|
3731 __m64 ms, md, ma, msa; |
|
3732 |
|
3733 pix2 = load (dst); |
|
3734 ma = expand_alpha_rev (to_m64 (m)); |
|
3735 ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ()); |
|
3736 md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ()); |
|
3737 |
|
3738 msa = expand_alpha (ms); |
|
3739 |
|
3740 store8888 (dst, (in_over (ms, msa, ma, md))); |
|
3741 } |
|
3742 } |
|
3743 else |
|
3744 { |
|
3745 BILINEAR_SKIP_ONE_PIXEL (); |
|
3746 } |
|
3747 |
|
3748 w--; |
|
3749 dst++; |
|
3750 } |
|
3751 |
|
3752 _mm_empty (); |
|
3753 } |
|
3754 |
|
3755 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER, |
|
3756 scaled_bilinear_scanline_mmx_8888_8_8888_OVER, |
|
3757 uint32_t, uint8_t, uint32_t, |
|
3758 COVER, FLAG_HAVE_NON_SOLID_MASK) |
|
3759 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER, |
|
3760 scaled_bilinear_scanline_mmx_8888_8_8888_OVER, |
|
3761 uint32_t, uint8_t, uint32_t, |
|
3762 PAD, FLAG_HAVE_NON_SOLID_MASK) |
|
3763 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER, |
|
3764 scaled_bilinear_scanline_mmx_8888_8_8888_OVER, |
|
3765 uint32_t, uint8_t, uint32_t, |
|
3766 NONE, FLAG_HAVE_NON_SOLID_MASK) |
|
3767 FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER, |
|
3768 scaled_bilinear_scanline_mmx_8888_8_8888_OVER, |
|
3769 uint32_t, uint8_t, uint32_t, |
|
3770 NORMAL, FLAG_HAVE_NON_SOLID_MASK) |
|
3771 |
|
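/* Scanline fetcher: widen x8r8g8b8 to a8r8g8b8 by forcing the alpha
 * byte to 0xff, eight pixels per iteration.
 */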
3772 static uint32_t * |
|
3773 mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask) |
|
3774 { |
|
3775 int w = iter->width; |
|
3776 uint32_t *dst = iter->buffer; |
|
3777 uint32_t *src = (uint32_t *)iter->bits; |
|
3778 |
|
3779 iter->bits += iter->stride; |
|
3780 |
|
3781 while (w && ((uintptr_t)dst) & 7) |
|
3782 { |
|
3783 *dst++ = (*src++) | 0xff000000; |
|
3784 w--; |
|
3785 } |
|
3786 |
|
3787 while (w >= 8) |
|
3788 { |
|
3789 __m64 vsrc1 = ldq_u ((__m64 *)(src + 0)); |
|
3790 __m64 vsrc2 = ldq_u ((__m64 *)(src + 2)); |
|
3791 __m64 vsrc3 = ldq_u ((__m64 *)(src + 4)); |
|
3792 __m64 vsrc4 = ldq_u ((__m64 *)(src + 6)); |
|
3793 |
|
3794 *(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000)); |
|
3795 *(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000)); |
|
3796 *(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000)); |
|
3797 *(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000)); |
|
3798 |
|
3799 dst += 8; |
|
3800 src += 8; |
|
3801 w -= 8; |
|
3802 } |
|
3803 |
|
3804 while (w) |
|
3805 { |
|
3806 *dst++ = (*src++) | 0xff000000; |
|
3807 w--; |
|
3808 } |
|
3809 |
|
3810 _mm_empty (); |
|
3811 return iter->buffer; |
|
3812 } |
|
3813 |
|
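/* Scanline fetcher: expand r5g6b5 to a8r8g8b8, four pixels at a time. */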
3814 static uint32_t * |
|
3815 mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask) |
|
3816 { |
|
3817 int w = iter->width; |
|
3818 uint32_t *dst = iter->buffer; |
|
3819 uint16_t *src = (uint16_t *)iter->bits; |
|
3820 |
|
3821 iter->bits += iter->stride; |
|
3822 |
|
3823 while (w && ((uintptr_t)dst) & 0x0f) |
|
3824 { |
|
3825 uint16_t s = *src++; |
|
3826 |
|
3827 *dst++ = convert_0565_to_8888 (s); |
|
3828 w--; |
|
3829 } |
|
3830 |
|
3831 while (w >= 4) |
|
3832 { |
|
3833 __m64 vsrc = ldq_u ((__m64 *)src); |
|
3834 __m64 mm0, mm1; |
|
3835 |
|
3836 expand_4xpacked565 (vsrc, &mm0, &mm1, 1); |
|
3837 |
|
3838 *(__m64 *)(dst + 0) = mm0; |
|
3839 *(__m64 *)(dst + 2) = mm1; |
|
3840 |
|
3841 dst += 4; |
|
3842 src += 4; |
|
3843 w -= 4; |
|
3844 } |
|
3845 |
|
3846 while (w) |
|
3847 { |
|
3848 uint16_t s = *src++; |
|
3849 |
|
3850 *dst++ = convert_0565_to_8888 (s); |
|
3851 w--; |
|
3852 } |
|
3853 |
|
3854 _mm_empty (); |
|
3855 return iter->buffer; |
|
3856 } |
|
3857 |
|
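/* Scanline fetcher: expand a8 to a8r8g8b8 (alpha in the top byte, color
 * channels zero), eight pixels per iteration.
 */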
3858 static uint32_t * |
|
3859 mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask) |
|
3860 { |
|
3861 int w = iter->width; |
|
3862 uint32_t *dst = iter->buffer; |
|
3863 uint8_t *src = iter->bits; |
|
3864 |
|
3865 iter->bits += iter->stride; |
|
3866 |
|
3867 while (w && (((uintptr_t)dst) & 15)) |
|
3868 { |
|
3869 *dst++ = *(src++) << 24; |
|
3870 w--; |
|
3871 } |
|
3872 |
|
3873 while (w >= 8) |
|
3874 { |
|
3875 __m64 mm0 = ldq_u ((__m64 *)src); |
|
3876 |
|
3877 __m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64(), mm0); |
|
3878 __m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64(), mm0); |
|
3879 __m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1); |
|
3880 __m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1); |
|
3881 __m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2); |
|
3882 __m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2); |
|
3883 |
|
3884 *(__m64 *)(dst + 0) = mm3; |
|
3885 *(__m64 *)(dst + 2) = mm4; |
|
3886 *(__m64 *)(dst + 4) = mm5; |
|
3887 *(__m64 *)(dst + 6) = mm6; |
|
3888 |
|
3889 dst += 8; |
|
3890 src += 8; |
|
3891 w -= 8; |
|
3892 } |
|
3893 |
|
3894 while (w) |
|
3895 { |
|
3896 *dst++ = *(src++) << 24; |
|
3897 w--; |
|
3898 } |
|
3899 |
|
3900 _mm_empty (); |
|
3901 return iter->buffer; |
|
3902 } |
|
3903 |
|
3904 typedef struct |
|
3905 { |
|
3906 pixman_format_code_t format; |
|
3907 pixman_iter_get_scanline_t get_scanline; |
|
3908 } fetcher_info_t; |
|
3909 |
|
3910 static const fetcher_info_t fetchers[] = |
|
3911 { |
|
3912 { PIXMAN_x8r8g8b8, mmx_fetch_x8r8g8b8 }, |
|
3913 { PIXMAN_r5g6b5, mmx_fetch_r5g6b5 }, |
|
3914 { PIXMAN_a8, mmx_fetch_a8 }, |
|
3915 { PIXMAN_null } |
|
3916 }; |
|
3917 |
|
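/* Use one of the fetchers above when a narrow (a8r8g8b8) source iterator
 * is requested for an untransformed bits image in a supported format;
 * return FALSE otherwise so that a more general iterator is used.
 */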
3918 static pixman_bool_t |
|
3919 mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter) |
|
3920 { |
|
3921 pixman_image_t *image = iter->image; |
|
3922 |
|
3923 #define FLAGS \ |
|
3924 (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM | \ |
|
3925 FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST) |
|
3926 |
|
3927 if ((iter->iter_flags & ITER_NARROW) && |
|
3928 (iter->image_flags & FLAGS) == FLAGS) |
|
3929 { |
|
3930 const fetcher_info_t *f; |
|
3931 |
|
3932 for (f = &fetchers[0]; f->format != PIXMAN_null; f++) |
|
3933 { |
|
3934 if (image->common.extended_format_code == f->format) |
|
3935 { |
|
3936 uint8_t *b = (uint8_t *)image->bits.bits; |
|
3937 int s = image->bits.rowstride * 4; |
|
3938 |
|
3939 iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8; |
|
3940 iter->stride = s; |
|
3941 |
|
3942 iter->get_scanline = f->get_scanline; |
|
3943 return TRUE; |
|
3944 } |
|
3945 } |
|
3946 } |
|
3947 |
|
3948 return FALSE; |
|
3949 } |
|
3950 |
|
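/* Table mapping (operator, source, mask, destination) combinations to
 * the composite routines above; terminated by PIXMAN_OP_NONE.
 */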
3951 static const pixman_fast_path_t mmx_fast_paths[] = |
|
3952 { |
|
3953 PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ), |
|
3954 PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ), |
|
3955 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ), |
|
3956 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ), |
|
3957 PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ), |
|
3958 PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ), |
|
3959 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ), |
|
3960 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ), |
|
3961 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ), |
|
3962 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ), |
|
3963 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ), |
|
3964 PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ), |
|
3965 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ), |
|
3966 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ), |
|
3967 PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ), |
|
3968 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ), |
|
3969 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ), |
|
3970 PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ), |
|
3971 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ), |
|
3972 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ), |
|
3973 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ), |
|
3974 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ), |
|
3975 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ), |
|
3976 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ), |
|
3977 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ), |
|
3978 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ), |
|
3979 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ), |
|
3980 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ), |
|
3981 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ), |
|
3982 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, mmx_composite_over_x888_8_8888 ), |
|
3983 PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ), |
|
3984 PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ), |
|
3985 PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ), |
|
3986 PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, mmx_composite_over_n_0565 ), |
|
3987 PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), |
|
3988 PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), |
|
3989 |
|
3990 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ), |
|
3991 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ), |
|
3992 PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ), |
|
3993 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ), |
|
3994 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ), |
|
3995 PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ), |
|
3996 |
|
3997 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888), |
|
3998 PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888), |
|
3999 |
|
4000 PIXMAN_STD_FAST_PATH (ADD, r5g6b5, null, r5g6b5, mmx_composite_add_0565_0565 ), |
|
4001 PIXMAN_STD_FAST_PATH (ADD, b5g6r5, null, b5g6r5, mmx_composite_add_0565_0565 ), |
|
4002 PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ), |
|
4003 PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ), |
|
4004 PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ), |
|
4005 PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ), |
|
4006 |
|
4007 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ), |
|
4008 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ), |
|
4009 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, mmx_composite_src_x888_0565 ), |
|
4010 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, mmx_composite_src_x888_0565 ), |
|
4011 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ), |
|
4012 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ), |
|
4013 PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ), |
|
4014 PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ), |
|
4015 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ), |
|
4016 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ), |
|
4017 PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), |
|
4018 PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), |
|
4019 PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), |
|
4020 PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), |
|
4021 PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ), |
|
4022 PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ), |
|
4023 |
|
4024 PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ), |
|
4025 PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ), |
|
4026 |
|
4027 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ), |
|
4028 SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ), |
|
4029 SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ), |
|
4030 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ), |
|
4031 SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ), |
|
4032 SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888 ), |
|
4033 |
|
4034 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ), |
|
4035 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ), |
|
4036 SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ), |
|
4037 SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ), |
|
4038 |
|
4039 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888 ), |
|
4040 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888 ), |
|
4041 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888 ), |
|
4042 SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888 ), |
|
4043 |
|
4044 { PIXMAN_OP_NONE }, |
|
4045 }; |
|
4046 |
|
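/* Create the MMX implementation: register the fast path table plus the
 * unified- and component-alpha combiners, and hook up blt, fill and the
 * source iterator initializer.
 */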
4047 pixman_implementation_t * |
|
4048 _pixman_implementation_create_mmx (pixman_implementation_t *fallback) |
|
4049 { |
|
4050 pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths); |
|
4051 |
|
4052 imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u; |
|
4053 imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u; |
|
4054 imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u; |
|
4055 imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u; |
|
4056 imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u; |
|
4057 imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u; |
|
4058 imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u; |
|
4059 imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u; |
|
4060 imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u; |
|
4061 imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u; |
|
4062 imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u; |
|
4063 |
|
4064 imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca; |
|
4065 imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca; |
|
4066 imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca; |
|
4067 imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca; |
|
4068 imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca; |
|
4069 imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca; |
|
4070 imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca; |
|
4071 imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca; |
|
4072 imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca; |
|
4073 imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca; |
|
4074 imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca; |
|
4075 |
|
4076 imp->blt = mmx_blt; |
|
4077 imp->fill = mmx_fill; |
|
4078 |
|
4079 imp->src_iter_init = mmx_src_iter_init; |
|
4080 |
|
4081 return imp; |
|
4082 } |
|
4083 |
|
4084 #endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */ |