gfx/cairo/libpixman/src/pixman-vmx.c

author:      Michael Schloh von Bennewitz <michael@schloh.com>
date:        Wed, 31 Dec 2014 06:09:35 +0100
changeset:   0:6474c204b198
permissions: -rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1,
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f, for hacking purposes.

/*
 * Copyright © 2007 Luca Barbato
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Luca Barbato not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Luca Barbato makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Luca Barbato (lu_zero@gentoo.org)
 *
 * Based on fbmmx.c by Owen Taylor, Søren Sandmann and Nicholas Miell
 */
#include <config.h>
#include "pixman-private.h"
#include "pixman-combine32.h"
#include <altivec.h>

#define AVV(x...) {x}
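/* Broadcast each pixel's alpha byte across all four channel positions.
 * Assuming the usual big-endian PowerPC a8r8g8b8 layout this VMX code
 * targets, alpha is byte 0 of each 32-bit lane, so the permute indices
 * 0x00/0x04/0x08/0x0C select the alpha of pixels 0..3 and replicate
 * each one four times. */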
static force_inline vector unsigned int
splat_alpha (vector unsigned int pix)
{
    return vec_perm (pix, pix,
                     (vector unsigned char)AVV (
                         0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04,
                         0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C));
}
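/* Multiply two vectors of 8-bit channels, treating each byte as a
 * fraction of 255.  Per channel this is the usual rounding-correct
 * divide-by-255 trick,
 *
 *     t = p * a + 0x80;
 *     result = (t + (t >> 8)) >> 8;
 *
 * vectorized: the bytes are widened to 16 bits (high and low halves
 * separately), multiply-added against a 0x0080 bias with vec_mladd,
 * corrected, shifted, and packed back to bytes with vec_packsu. */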
static force_inline vector unsigned int
pix_multiply (vector unsigned int p, vector unsigned int a)
{
    vector unsigned short hi, lo, mod;

    /* unpack to short */
    hi = (vector unsigned short)
        vec_mergeh ((vector unsigned char)AVV (0),
                    (vector unsigned char)p);

    mod = (vector unsigned short)
        vec_mergeh ((vector unsigned char)AVV (0),
                    (vector unsigned char)a);

    hi = vec_mladd (hi, mod, (vector unsigned short)
                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
                         0x0080, 0x0080, 0x0080, 0x0080));

    hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));

    hi = vec_sr (hi, vec_splat_u16 (8));

    /* unpack to short */
    lo = (vector unsigned short)
        vec_mergel ((vector unsigned char)AVV (0),
                    (vector unsigned char)p);
    mod = (vector unsigned short)
        vec_mergel ((vector unsigned char)AVV (0),
                    (vector unsigned char)a);

    lo = vec_mladd (lo, mod, (vector unsigned short)
                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
                         0x0080, 0x0080, 0x0080, 0x0080));

    lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));

    lo = vec_sr (lo, vec_splat_u16 (8));

    return (vector unsigned int)vec_packsu (hi, lo);
}
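/* Per-channel saturating add: each byte clamps at 0xFF instead of
 * wrapping, as the ADD compositing operator requires. */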
static force_inline vector unsigned int
pix_add (vector unsigned int a, vector unsigned int b)
{
    return (vector unsigned int)vec_adds ((vector unsigned char)a,
                                          (vector unsigned char)b);
}
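/* Compute (x * a + y * b) / 255 per channel, the weighted sum of two
 * pixels.  This is the workhorse of the ATOP and XOR combiners below. */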
static force_inline vector unsigned int
pix_add_mul (vector unsigned int x,
             vector unsigned int a,
             vector unsigned int y,
             vector unsigned int b)
{
    vector unsigned int t1, t2;

    t1 = pix_multiply (x, a);
    t2 = pix_multiply (y, b);

    return pix_add (t1, t2);
}
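/* vec_nor (src, src) is a bitwise NOT, which on 8-bit channels equals
 * 255 - c, i.e. the "one minus alpha" complement. */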
static force_inline vector unsigned int
negate (vector unsigned int src)
{
    return vec_nor (src, src);
}

/* dest*~srca + src */
static force_inline vector unsigned int
over (vector unsigned int src,
      vector unsigned int srca,
      vector unsigned int dest)
{
    vector unsigned char tmp = (vector unsigned char)
        pix_multiply (dest, negate (srca));

    tmp = vec_adds ((vector unsigned char)src, tmp);
    return (vector unsigned int)tmp;
}
/* in == pix_multiply */
#define in_over(src, srca, mask, dest)					\
    over (pix_multiply (src, mask),					\
          pix_multiply (srca, mask), dest)
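/* AltiVec has no unaligned loads or stores, so unaligned pointers are
 * handled with permute vectors: vec_lvsl builds the permute that
 * re-aligns data loaded from two adjacent aligned vectors, and
 * vec_lvsr the inverse permute used when storing back.  These macros
 * compute the masks once per call. */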
#define COMPUTE_SHIFT_MASK(source)					\
    source ## _mask = vec_lvsl (0, source);

#define COMPUTE_SHIFT_MASKS(dest, source)				\
    dest ## _mask = vec_lvsl (0, dest);					\
    source ## _mask = vec_lvsl (0, source);				\
    store_mask = vec_lvsr (0, dest);

#define COMPUTE_SHIFT_MASKC(dest, source, mask)				\
    mask ## _mask = vec_lvsl (0, mask);					\
    dest ## _mask = vec_lvsl (0, dest);					\
    source ## _mask = vec_lvsl (0, source);				\
    store_mask = vec_lvsr (0, dest);

/* The caller must declare the temporary vectors tmp1..tmp4.
 * Note: tmp3 and tmp4 must remain untouched between LOAD_VECTORS* and
 * STORE_VECTOR, which reuses them as the aligned edge words of the
 * destination. */
#define LOAD_VECTORS(dest, source)			  \
    tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
    tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
    tmp3 = (typeof(tmp3))vec_ld (0, dest);		  \
    v ## source = (typeof(v ## source))			  \
	vec_perm (tmp1, tmp2, source ## _mask);		  \
    tmp4 = (typeof(tmp4))vec_ld (15, dest);		  \
    v ## dest = (typeof(v ## dest))			  \
	vec_perm (tmp3, tmp4, dest ## _mask);

#define LOAD_VECTORSC(dest, source, mask)		  \
    tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
    tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
    tmp3 = (typeof(tmp3))vec_ld (0, dest);		  \
    v ## source = (typeof(v ## source))			  \
	vec_perm (tmp1, tmp2, source ## _mask);		  \
    tmp4 = (typeof(tmp4))vec_ld (15, dest);		  \
    tmp1 = (typeof(tmp1))vec_ld (0, mask);		  \
    v ## dest = (typeof(v ## dest))			  \
	vec_perm (tmp3, tmp4, dest ## _mask);		  \
    tmp2 = (typeof(tmp2))vec_ld (15, mask);		  \
    v ## mask = (typeof(v ## mask))			  \
	vec_perm (tmp1, tmp2, mask ## _mask);

#define LOAD_VECTORSM(dest, source, mask)				\
    LOAD_VECTORSC (dest, source, mask)					\
    v ## source = pix_multiply (v ## source,				\
                                splat_alpha (v ## mask));
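/* Store an unaligned destination vector without clobbering bytes
 * outside it: the aligned words loaded earlier (tmp3/tmp4) supply the
 * edge bytes, the new value is merged into place with store_mask, and
 * both aligned words are written back. */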
#define STORE_VECTOR(dest)						\
    edges = vec_perm (tmp4, tmp3, dest ## _mask);			\
    tmp3 = vec_perm ((vector unsigned char)v ## dest, edges, store_mask); \
    tmp1 = vec_perm (edges, (vector unsigned char)v ## dest, store_mask); \
    vec_st ((vector unsigned int) tmp3, 15, dest);			\
    vec_st ((vector unsigned int) tmp1, 0, dest);
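/* Each combiner below processes four pixels per vector iteration and
 * finishes the trailing width % 4 pixels with the scalar macros from
 * pixman-combine32.h.  A scalar sketch of the unmasked OVER loop that
 * follows (UN8x4_MUL_UN8_ADD_UN8x4 computes d = d * ia / 255 + s with
 * per-channel saturation):
 *
 *     for (i = 0; i < width; i++)
 *     {
 *         uint32_t s = src[i], d = dest[i];
 *         uint32_t ia = ALPHA_8 (~s);    (255 - src alpha)
 *         UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
 *         dest[i] = d;
 *     }
 */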
static void
vmx_combine_over_u_no_mask (uint32_t *      dest,
                            const uint32_t *src,
                            int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = over (vsrc, splat_alpha (vsrc), vdest);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t ia = ALPHA_8 (~s);

	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);

	dest[i] = d;
    }
}

static void
vmx_combine_over_u_mask (uint32_t *      dest,
                         const uint32_t *src,
                         const uint32_t *mask,
                         int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = over (vsrc, splat_alpha (vsrc), vdest);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t ia;

	UN8x4_MUL_UN8 (s, m);

	ia = ALPHA_8 (~s);

	UN8x4_MUL_UN8_ADD_UN8x4 (d, ia, s);
	dest[i] = d;
    }
}

static void
vmx_combine_over_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    if (mask)
	vmx_combine_over_u_mask (dest, src, mask, width);
    else
	vmx_combine_over_u_no_mask (dest, src, width);
}
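/* OVER_REVERSE composites the destination over the source:
 * result = dest + src * (1 - dest.alpha). */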
static void
vmx_combine_over_reverse_u_no_mask (uint32_t *      dest,
                                    const uint32_t *src,
                                    int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = over (vdest, splat_alpha (vdest), vsrc);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t ia = ALPHA_8 (~dest[i]);

	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
	dest[i] = s;
    }
}

static void
vmx_combine_over_reverse_u_mask (uint32_t *      dest,
                                 const uint32_t *src,
                                 const uint32_t *mask,
                                 int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = over (vdest, splat_alpha (vdest), vsrc);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t ia = ALPHA_8 (~dest[i]);

	UN8x4_MUL_UN8 (s, m);

	UN8x4_MUL_UN8_ADD_UN8x4 (s, ia, d);
	dest[i] = s;
    }
}

static void
vmx_combine_over_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    if (mask)
	vmx_combine_over_reverse_u_mask (dest, src, mask, width);
    else
	vmx_combine_over_reverse_u_no_mask (dest, src, width);
}
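/* IN keeps the source only where the destination has coverage:
 * result = src * dest.alpha. */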
static void
vmx_combine_in_u_no_mask (uint32_t *      dest,
                          const uint32_t *src,
                          int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_multiply (vsrc, splat_alpha (vdest));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t a = ALPHA_8 (dest[i]);

	UN8x4_MUL_UN8 (s, a);
	dest[i] = s;
    }
}

static void
vmx_combine_in_u_mask (uint32_t *      dest,
                       const uint32_t *src,
                       const uint32_t *mask,
                       int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_multiply (vsrc, splat_alpha (vdest));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t a = ALPHA_8 (dest[i]);

	UN8x4_MUL_UN8 (s, m);
	UN8x4_MUL_UN8 (s, a);

	dest[i] = s;
    }
}

static void
vmx_combine_in_u (pixman_implementation_t *imp,
                  pixman_op_t              op,
                  uint32_t *               dest,
                  const uint32_t *         src,
                  const uint32_t *         mask,
                  int                      width)
{
    if (mask)
	vmx_combine_in_u_mask (dest, src, mask, width);
    else
	vmx_combine_in_u_no_mask (dest, src, width);
}
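/* IN_REVERSE: result = dest * src.alpha. */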
static void
vmx_combine_in_reverse_u_no_mask (uint32_t *      dest,
                                  const uint32_t *src,
                                  int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_multiply (vdest, splat_alpha (vsrc));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t d = dest[i];
	uint32_t a = ALPHA_8 (src[i]);

	UN8x4_MUL_UN8 (d, a);

	dest[i] = d;
    }
}

static void
vmx_combine_in_reverse_u_mask (uint32_t *      dest,
                               const uint32_t *src,
                               const uint32_t *mask,
                               int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_multiply (vdest, splat_alpha (vsrc));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t d = dest[i];
	uint32_t a = src[i];

	UN8x4_MUL_UN8 (a, m);
	a = ALPHA_8 (a);
	UN8x4_MUL_UN8 (d, a);

	dest[i] = d;
    }
}

static void
vmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          pixman_op_t              op,
                          uint32_t *               dest,
                          const uint32_t *         src,
                          const uint32_t *         mask,
                          int                      width)
{
    if (mask)
	vmx_combine_in_reverse_u_mask (dest, src, mask, width);
    else
	vmx_combine_in_reverse_u_no_mask (dest, src, width);
}
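/* OUT keeps the source only where the destination is transparent:
 * result = src * (1 - dest.alpha). */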
static void
vmx_combine_out_u_no_mask (uint32_t *      dest,
                           const uint32_t *src,
                           int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t a = ALPHA_8 (~dest[i]);

	UN8x4_MUL_UN8 (s, a);

	dest[i] = s;
    }
}

static void
vmx_combine_out_u_mask (uint32_t *      dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_multiply (vsrc, splat_alpha (negate (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t a = ALPHA_8 (~dest[i]);

	UN8x4_MUL_UN8 (s, m);
	UN8x4_MUL_UN8 (s, a);

	dest[i] = s;
    }
}

static void
vmx_combine_out_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    if (mask)
	vmx_combine_out_u_mask (dest, src, mask, width);
    else
	vmx_combine_out_u_no_mask (dest, src, width);
}
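/* OUT_REVERSE: result = dest * (1 - src.alpha). */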
static void
vmx_combine_out_reverse_u_no_mask (uint32_t *      dest,
                                   const uint32_t *src,
                                   int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t d = dest[i];
	uint32_t a = ALPHA_8 (~src[i]);

	UN8x4_MUL_UN8 (d, a);

	dest[i] = d;
    }
}

static void
vmx_combine_out_reverse_u_mask (uint32_t *      dest,
                                const uint32_t *src,
                                const uint32_t *mask,
                                int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_multiply (vdest, splat_alpha (negate (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t d = dest[i];
	uint32_t a = src[i];

	UN8x4_MUL_UN8 (a, m);
	a = ALPHA_8 (~a);
	UN8x4_MUL_UN8 (d, a);

	dest[i] = d;
    }
}

static void
vmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    if (mask)
	vmx_combine_out_reverse_u_mask (dest, src, mask, width);
    else
	vmx_combine_out_reverse_u_no_mask (dest, src, width);
}
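/* ATOP: result = src * dest.alpha + dest * (1 - src.alpha). */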
static void
vmx_combine_atop_u_no_mask (uint32_t *      dest,
                            const uint32_t *src,
                            int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
			     vdest, splat_alpha (negate (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t dest_a = ALPHA_8 (d);
	uint32_t src_ia = ALPHA_8 (~s);

	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);

	dest[i] = s;
    }
}

static void
vmx_combine_atop_u_mask (uint32_t *      dest,
                         const uint32_t *src,
                         const uint32_t *mask,
                         int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
			     vdest, splat_alpha (negate (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t dest_a = ALPHA_8 (d);
	uint32_t src_ia;

	UN8x4_MUL_UN8 (s, m);

	src_ia = ALPHA_8 (~s);

	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_a, d, src_ia);

	dest[i] = s;
    }
}

static void
vmx_combine_atop_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    if (mask)
	vmx_combine_atop_u_mask (dest, src, mask, width);
    else
	vmx_combine_atop_u_no_mask (dest, src, width);
}
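/* ATOP_REVERSE: result = dest * src.alpha + src * (1 - dest.alpha). */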
static void
vmx_combine_atop_reverse_u_no_mask (uint32_t *      dest,
                                    const uint32_t *src,
                                    int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_add_mul (vdest, splat_alpha (vsrc),
			     vsrc, splat_alpha (negate (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t src_a = ALPHA_8 (s);
	uint32_t dest_ia = ALPHA_8 (~d);

	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);

	dest[i] = s;
    }
}

static void
vmx_combine_atop_reverse_u_mask (uint32_t *      dest,
                                 const uint32_t *src,
                                 const uint32_t *mask,
                                 int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_add_mul (vdest, splat_alpha (vsrc),
			     vsrc, splat_alpha (negate (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t src_a;
	uint32_t dest_ia = ALPHA_8 (~d);

	UN8x4_MUL_UN8 (s, m);

	src_a = ALPHA_8 (s);

	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_a);

	dest[i] = s;
    }
}

static void
vmx_combine_atop_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    if (mask)
	vmx_combine_atop_reverse_u_mask (dest, src, mask, width);
    else
	vmx_combine_atop_reverse_u_no_mask (dest, src, width);
}
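/* XOR: result = src * (1 - dest.alpha) + dest * (1 - src.alpha). */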
static void
vmx_combine_xor_u_no_mask (uint32_t *      dest,
                           const uint32_t *src,
                           int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
			     vdest, splat_alpha (negate (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t src_ia = ALPHA_8 (~s);
	uint32_t dest_ia = ALPHA_8 (~d);

	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);

	dest[i] = s;
    }
}

static void
vmx_combine_xor_u_mask (uint32_t *      dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_add_mul (vsrc, splat_alpha (negate (vdest)),
			     vdest, splat_alpha (negate (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t src_ia;
	uint32_t dest_ia = ALPHA_8 (~d);

	UN8x4_MUL_UN8 (s, m);

	src_ia = ALPHA_8 (~s);

	UN8x4_MUL_UN8_ADD_UN8x4_MUL_UN8 (s, dest_ia, d, src_ia);

	dest[i] = s;
    }
}

static void
vmx_combine_xor_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    if (mask)
	vmx_combine_xor_u_mask (dest, src, mask, width);
    else
	vmx_combine_xor_u_no_mask (dest, src, width);
}
static void
vmx_combine_add_u_no_mask (uint32_t *      dest,
                           const uint32_t *src,
                           int             width)
{
    int i;
    vector unsigned int vdest, vsrc;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKS (dest, src);
    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORS (dest, src);

	vdest = pix_add (vsrc, vdest);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t s = src[i];
	uint32_t d = dest[i];

	UN8x4_ADD_UN8x4 (d, s);

	dest[i] = d;
    }
}

static void
vmx_combine_add_u_mask (uint32_t *      dest,
                        const uint32_t *src,
                        const uint32_t *mask,
                        int             width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, src_mask, mask_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSM (dest, src, mask);

	vdest = pix_add (vsrc, vdest);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t m = ALPHA_8 (mask[i]);
	uint32_t s = src[i];
	uint32_t d = dest[i];

	UN8x4_MUL_UN8 (s, m);
	UN8x4_ADD_UN8x4 (d, s);

	dest[i] = d;
    }
}

static void
vmx_combine_add_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    if (mask)
	vmx_combine_add_u_mask (dest, src, mask, width);
    else
	vmx_combine_add_u_no_mask (dest, src, width);
}
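/* The _ca ("component alpha") combiners below treat the mask as a
 * per-channel factor instead of replicating its alpha, so red, green
 * and blue are weighted independently (as used for subpixel text). */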
static void
vmx_combine_src_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_multiply (vsrc, vmask);

	STORE_VECTOR (dest);

	mask += 4;
	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];

	UN8x4_MUL_UN8x4 (s, a);

	dest[i] = s;
    }
}
static void
vmx_combine_over_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = in_over (vsrc, splat_alpha (vsrc), vmask, vdest);

	STORE_VECTOR (dest);

	mask += 4;
	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t sa = ALPHA_8 (s);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8 (a, sa);
	UN8x4_MUL_UN8x4_ADD_UN8x4 (d, ~a, s);

	dest[i] = d;
    }
}
static void
vmx_combine_over_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = over (vdest, splat_alpha (vdest), pix_multiply (vsrc, vmask));

	STORE_VECTOR (dest);

	mask += 4;
	src += 4;
	dest += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t ida = ALPHA_8 (~d);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8_ADD_UN8x4 (s, ida, d);

	dest[i] = s;
    }
}
static void
vmx_combine_in_ca (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_multiply (pix_multiply (vsrc, vmask), splat_alpha (vdest));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t da = ALPHA_8 (dest[i]);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8 (s, da);

	dest[i] = s;
    }
}
static void
vmx_combine_in_reverse_ca (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_multiply (vdest, pix_multiply (vmask, splat_alpha (vsrc)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t d = dest[i];
	uint32_t sa = ALPHA_8 (src[i]);

	UN8x4_MUL_UN8 (a, sa);
	UN8x4_MUL_UN8x4 (d, a);

	dest[i] = d;
    }
}
static void
vmx_combine_out_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_multiply (
	    pix_multiply (vsrc, vmask), splat_alpha (negate (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t da = ALPHA_8 (~d);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8 (s, da);

	dest[i] = s;
    }
}
static void
vmx_combine_out_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_multiply (
	    vdest, negate (pix_multiply (vmask, splat_alpha (vsrc))));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t sa = ALPHA_8 (s);

	UN8x4_MUL_UN8 (a, sa);
	UN8x4_MUL_UN8x4 (d, ~a);

	dest[i] = d;
    }
}
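/* Component-alpha ATOP: result = src * mask * dest.alpha
 *                              + dest * (1 - src.alpha * mask),
 * per channel.  vsrca snapshots the source alpha before vsrc is
 * multiplied by the mask. */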
static void
vmx_combine_atop_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask, vsrca;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vsrca = splat_alpha (vsrc);

	vsrc = pix_multiply (vsrc, vmask);
	vmask = pix_multiply (vmask, vsrca);

	vdest = pix_add_mul (vsrc, splat_alpha (vdest),
			     negate (vmask), vdest);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t sa = ALPHA_8 (s);
	uint32_t da = ALPHA_8 (d);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8 (a, sa);
	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);

	dest[i] = d;
    }
}
static void
vmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_add_mul (vdest,
			     pix_multiply (vmask, splat_alpha (vsrc)),
			     pix_multiply (vsrc, vmask),
			     negate (splat_alpha (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t sa = ALPHA_8 (s);
	uint32_t da = ALPHA_8 (~d);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8 (a, sa);
	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, a, s, da);

	dest[i] = d;
    }
}
static void
vmx_combine_xor_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_add_mul (vdest,
			     negate (pix_multiply (vmask, splat_alpha (vsrc))),
			     pix_multiply (vsrc, vmask),
			     negate (splat_alpha (vdest)));

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];
	uint32_t sa = ALPHA_8 (s);
	uint32_t da = ALPHA_8 (~d);

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_MUL_UN8 (a, sa);
	UN8x4_MUL_UN8x4_ADD_UN8x4_MUL_UN8 (d, ~a, s, da);

	dest[i] = d;
    }
}
static void
vmx_combine_add_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    int i;
    vector unsigned int vdest, vsrc, vmask;
    vector unsigned char tmp1, tmp2, tmp3, tmp4, edges,
	dest_mask, mask_mask, src_mask, store_mask;

    COMPUTE_SHIFT_MASKC (dest, src, mask);

    /* printf ("%s\n",__PRETTY_FUNCTION__); */
    for (i = width / 4; i > 0; i--)
    {
	LOAD_VECTORSC (dest, src, mask);

	vdest = pix_add (pix_multiply (vsrc, vmask), vdest);

	STORE_VECTOR (dest);

	src += 4;
	dest += 4;
	mask += 4;
    }

    for (i = width % 4; --i >= 0;)
    {
	uint32_t a = mask[i];
	uint32_t s = src[i];
	uint32_t d = dest[i];

	UN8x4_MUL_UN8x4 (s, a);
	UN8x4_ADD_UN8x4 (s, d);

	dest[i] = s;
    }
}
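/* No full-operation fast paths: the table below holds only the
 * PIXMAN_OP_NONE sentinel, so this implementation accelerates the
 * combiner stage and defers everything else to the fallback. */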
static const pixman_fast_path_t vmx_fast_paths[] =
{
    {   PIXMAN_OP_NONE	},
};
pixman_implementation_t *
_pixman_implementation_create_vmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, vmx_fast_paths);

    /* Set up function pointers */
    imp->combine_32[PIXMAN_OP_OVER] = vmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = vmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = vmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = vmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = vmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = vmx_combine_add_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = vmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = vmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = vmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = vmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = vmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = vmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = vmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = vmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = vmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = vmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = vmx_combine_add_ca;

    return imp;
}
