Sat, 03 Jan 2015 20:18:00 +0100
Conditionally enable the double-key logic according to private browsing mode
or the privacy.thirdparty.isolate preference, and implement it in
GetCookieStringCommon and FindCookie, where it counts...
With some reservations about how to convince FindCookie callers to test the
condition and pass a nullptr when the double-key logic is disabled.
/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkCachePreload_arm.h"
#include "SkColor_opts_neon.h"
#include <arm_neon.h>

void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load
        vsrc = vld4_u8((uint8_t*)src);

        // Convert src to 565
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        src += 8;
        count -= 8;
    };

    // Leftovers
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    };
}

void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
            "ands ip, %[count], #7 \n\t"
            "vmov.u8 d31, #1<<7 \n\t"
            "vld1.16 {q12}, [%[dst]] \n\t"
            "vld4.8 {d0-d3}, [%[src]] \n\t"
            // Thumb does not support the standard ARM conditional
            // instructions but instead requires the 'it' instruction
            // to signal conditional execution
            "it eq \n\t"
            "moveq ip, #8 \n\t"
            "mov %[keep_dst], %[dst] \n\t"

            "add %[src], %[src], ip, LSL#2 \n\t"
            "add %[dst], %[dst], ip, LSL#1 \n\t"
            "subs %[count], %[count], ip \n\t"
            "b 9f \n\t"
            // LOOP
            "2: \n\t"

            "vld1.16 {q12}, [%[dst]]! \n\t"
            "vld4.8 {d0-d3}, [%[src]]! \n\t"
            "vst1.16 {q10}, [%[keep_dst]] \n\t"
            "sub %[keep_dst], %[dst], #8*2 \n\t"
            "subs %[count], %[count], #8 \n\t"
            "9: \n\t"
            "pld [%[dst],#32] \n\t"
            // expand 0565 q12 to 8888 {d4-d7}
            "vmovn.u16 d4, q12 \n\t"
            "vshr.u16 q11, q12, #5 \n\t"
            "vshr.u16 q10, q12, #6+5 \n\t"
            "vmovn.u16 d5, q11 \n\t"
            "vmovn.u16 d6, q10 \n\t"
            "vshl.u8 d4, d4, #3 \n\t"
            "vshl.u8 d5, d5, #2 \n\t"
            "vshl.u8 d6, d6, #3 \n\t"

            "vmovl.u8 q14, d31 \n\t"
            "vmovl.u8 q13, d31 \n\t"
            "vmovl.u8 q12, d31 \n\t"

            // duplicate in 4/2/1 & 8pix vsns
            "vmvn.8 d30, d3 \n\t"
            "vmlal.u8 q14, d30, d6 \n\t"
            "vmlal.u8 q13, d30, d5 \n\t"
            "vmlal.u8 q12, d30, d4 \n\t"
            "vshr.u16 q8, q14, #5 \n\t"
            "vshr.u16 q9, q13, #6 \n\t"
            "vaddhn.u16 d6, q14, q8 \n\t"
            "vshr.u16 q8, q12, #5 \n\t"
            "vaddhn.u16 d5, q13, q9 \n\t"
            "vqadd.u8 d6, d6, d0 \n\t" // moved up
            "vaddhn.u16 d4, q12, q8 \n\t"
            // intentionally don't calculate alpha
            // result in d4-d6

            "vqadd.u8 d5, d5, d1 \n\t"
            "vqadd.u8 d4, d4, d2 \n\t"

            // pack 8888 {d4-d6} to 0565 q10
            "vshll.u8 q10, d6, #8 \n\t"
            "vshll.u8 q3, d5, #8 \n\t"
            "vshll.u8 q2, d4, #8 \n\t"
            "vsri.u16 q10, q3, #5 \n\t"
            "vsri.u16 q10, q2, #11 \n\t"

            "bne 2b \n\t"

            "1: \n\t"
            "vst1.16 {q10}, [%[keep_dst]] \n\t"
            : [count] "+r" (count)
            : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
            : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
              "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
              "d30","d31"
        );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
            "vmov.u8 d31, #1<<7 \n\t"
            "mov %[keep_dst], %[dst] \n\t"

            "tst %[count], #4 \n\t"
            "beq 14f \n\t"
            "vld1.16 {d25}, [%[dst]]! \n\t"
            "vld1.32 {q1}, [%[src]]! \n\t"

            "14: \n\t"
            "tst %[count], #2 \n\t"
            "beq 12f \n\t"
            "vld1.32 {d24[1]}, [%[dst]]! \n\t"
            "vld1.32 {d1}, [%[src]]! \n\t"

            "12: \n\t"
            "tst %[count], #1 \n\t"
            "beq 11f \n\t"
            "vld1.16 {d24[1]}, [%[dst]]! \n\t"
            "vld1.32 {d0[1]}, [%[src]]! \n\t"

            "11: \n\t"
            // unzips achieve the same as a vld4 operation
            "vuzpq.u16 q0, q1 \n\t"
            "vuzp.u8 d0, d1 \n\t"
            "vuzp.u8 d2, d3 \n\t"
            // expand 0565 q12 to 8888 {d4-d7}
            "vmovn.u16 d4, q12 \n\t"
            "vshr.u16 q11, q12, #5 \n\t"
            "vshr.u16 q10, q12, #6+5 \n\t"
            "vmovn.u16 d5, q11 \n\t"
            "vmovn.u16 d6, q10 \n\t"
            "vshl.u8 d4, d4, #3 \n\t"
            "vshl.u8 d5, d5, #2 \n\t"
            "vshl.u8 d6, d6, #3 \n\t"

            "vmovl.u8 q14, d31 \n\t"
            "vmovl.u8 q13, d31 \n\t"
            "vmovl.u8 q12, d31 \n\t"

            // duplicate in 4/2/1 & 8pix vsns
            "vmvn.8 d30, d3 \n\t"
            "vmlal.u8 q14, d30, d6 \n\t"
            "vmlal.u8 q13, d30, d5 \n\t"
            "vmlal.u8 q12, d30, d4 \n\t"
            "vshr.u16 q8, q14, #5 \n\t"
            "vshr.u16 q9, q13, #6 \n\t"
            "vaddhn.u16 d6, q14, q8 \n\t"
            "vshr.u16 q8, q12, #5 \n\t"
            "vaddhn.u16 d5, q13, q9 \n\t"
            "vqadd.u8 d6, d6, d0 \n\t" // moved up
            "vaddhn.u16 d4, q12, q8 \n\t"
            // intentionally don't calculate alpha
            // result in d4-d6

            "vqadd.u8 d5, d5, d1 \n\t"
            "vqadd.u8 d4, d4, d2 \n\t"

            // pack 8888 {d4-d6} to 0565 q10
            "vshll.u8 q10, d6, #8 \n\t"
            "vshll.u8 q3, d5, #8 \n\t"
            "vshll.u8 q2, d4, #8 \n\t"
            "vsri.u16 q10, q3, #5 \n\t"
            "vsri.u16 q10, q2, #11 \n\t"

            // store
            "tst %[count], #4 \n\t"
            "beq 24f \n\t"
            "vst1.16 {d21}, [%[keep_dst]]! \n\t"

            "24: \n\t"
            "tst %[count], #2 \n\t"
            "beq 22f \n\t"
            "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t"

            "22: \n\t"
            "tst %[count], #1 \n\t"
            "beq 21f \n\t"
            "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t"

            "21: \n\t"
            : [count] "+r" (count)
            : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
            : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
              "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
              "d30","d31"
        );
    }
}

static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);
    return vshrq_n_u16(prod, 8);
}
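
// For reference, the same rounding trick in scalar form: for a 16-bit product
// p = a*b with a, b <= 255,
//     ((p + 128) + ((p + 128) >> 8)) >> 8
// rounds p/255 to the nearest integer, e.g. p = 255*255 = 65025 becomes
// 65153, 65153 + (65153 >> 8) = 65407, and 65407 >> 8 = 255.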

void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    /* This code implements a Neon version of S32A_D565_Blend. The results have
     * a few mismatches compared to the original code. These mismatches never
     * exceed 1.
     */
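    // One concrete instance of that <=1 difference, assuming the default
    // (non-S32A_D565_BLEND_EXACT) path below: vrshrq_n_u16(x, 8) computes
    // (x + 128) >> 8, so for x = 255*255 = 65025 it yields 254, while the
    // exact SkDiv255Round(65025) is 255.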

    if (count >= 8) {
        uint16x8_t valpha_max, vmask_blue;
        uint8x8_t valpha;

        // prepare constants
        valpha_max = vmovq_n_u16(255);
        valpha = vdup_n_u8(alpha);
        vmask_blue = vmovq_n_u16(SK_B16_MASK);

        do {
            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
            uint16x8_t vres_a, vres_r, vres_g, vres_b;
            uint8x8x4_t vsrc;

            // load pixels
            vdst = vld1q_u16(dst);
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            asm (
                "vld4.u8 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8 {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif

            // deinterleave dst
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);                 // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                              // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);                // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

            // shift src to 565
            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            // prepare dst_scale
            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);

#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif
            // pack result
            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // leftovers
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst += 1;
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
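
// Worked example of the row/offset addressing above: for y&3 == 1 and
// x&3 == 2, dstart points at the third byte of the second row, so
// vld1_u8(dstart) loads {7, 3, 6, 2, 7, 3, 6, 2} -- the 4x4 dither row
// {6, 2, 7, 3} rotated so that lane i carries the dither value for pixel x+i.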

void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{
    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values

        int16x8_t vscale = vdupq_n_s16(scale);       // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);      // set up blue mask

        do {
            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm (
                    "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                    :
                );
                vsrc_g = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
                vsrc_r = d2; vsrc_b = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
                vsrc_r = d0; vsrc_b = d2;
#endif
            }

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // shift out red, then down to get green
            vdst_r = vshrq_n_u16(vdst, 5+6);                 // shift down to get red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;

        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(255 == alpha);
    if (count > 0) {

        uint8x8_t alpha_mask;

        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);
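        // The vtbl1_u8 below spreads the two source alphas across all eight
        // byte lanes: assuming alpha sits in the top byte of each little-endian
        // 32-bit pixel, indices {3,3,3,3,7,7,7,7} produce
        // {a0,a0,a0,a0,a1,a1,a1,a1}, and vsubw_u8(vdupq_n_u16(256), ...) then
        // gives the per-lane dst scale 256 - a, i.e. SkAlpha255To256(255 - a).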

        /* do the NEON unrolled code */
#define UNROLL 4
        while (count >= UNROLL) {
            uint8x8_t src_raw, dst_raw, dst_final;
            uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

            /* The two prefetches below may make the code slightly
             * slower for small values of count but are worth having
             * in the general case.
             */
            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            /* get the source */
            src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if UNROLL > 2
            src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

            /* get and hold the dst too */
            dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if UNROLL > 2
            dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

            /* 1st and 2nd bits of the unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                /* get the alphas spread out properly */
                alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final = vadd_u8(src_raw, dst_cooked);
            }

#if UNROLL > 2
            /* the 3rd and 4th bits of our unrolling */
            {
                uint8x8_t dst_cooked;
                uint16x8_t dst_wide;
                uint8x8_t alpha_narrow;
                uint16x8_t alpha_wide;

                alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
                alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

                /* spread the dest */
                dst_wide = vmovl_u8(dst_raw_2);

                /* alpha mul the dest */
                dst_wide = vmulq_u16 (dst_wide, alpha_wide);
                dst_cooked = vshrn_n_u16(dst_wide, 8);

                /* sum -- ignoring any byte lane overflows */
                dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
            }
#endif

            vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if UNROLL > 2
            vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

            src += UNROLL;
            dst += UNROLL;
            count -= UNROLL;
        }
#undef UNROLL

        /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}

void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                          const SkPMColor* SK_RESTRICT src,
                                          int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
        return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ  = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
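    // These comparisons test the whole pixel: assuming alpha occupies bits
    // 24-31 (SK_A32_SHIFT == 24, as the proc table at the end of this file
    // requires for this routine), *src >= 0xFF000000 holds exactly when the
    // alpha byte is 0xFF, and *src <= 0x00FFFFFF exactly when it is 0x00.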

#define UNROLL 4
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    do {

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
           it might make sense to go to the optimized loops */
        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while(src < src_end);

    if (src >= src_end)
        goto TAIL;

    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /*fall-thru*/

ALPHA_0:

    /*In this state, we know the current alpha is 0 and
      we optimize for the next alpha also being zero. */
    src_temp = src;  //so we don't have to increment dst every time
    do {
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
    } while(src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if( src >= src_end)
        goto TAIL;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0]=src[0];
        dst[1]=src[1];
        dst[2]=src[2];
        dst[3]=src[3];
        src+=UNROLL;
        dst+=UNROLL;
        if(src >= src_end)
            goto TAIL;
    }

    //Handle remainder.
    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1; //goto the real end
    while(src != src_end) {
        if( *src != 0 ) {
            if( *src >= ALPHA_OPAQ ) {
                *dst = *src;
            }
            else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef UNROLL
    return;
}

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;
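    // Per channel this computes (src*src_scale + dst*dst_scale) >> 8 with
    // src_scale = alpha + 1 and dst_scale = 256 - src_scale; e.g. alpha = 127
    // gives src_scale = dst_scale = 128, i.e. roughly (src + dst) / 2.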

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}

void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                               const SkPMColor* SK_RESTRICT src,
                               int count, U8CPU alpha) {

    SkASSERT(255 >= alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale *= alpha256;
        dst_scale >>= 8;
        dst_scale = 256 - dst_scale;

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {

            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while(count);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef DEBUG_OPAQUE_DITHER

#if defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf,"%8s:", str);
    len = (len / sizeof(uint16_t)); /* passed as bytes */
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif

void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8

    if (count >= UNROLL) {

#if defined(DEBUG_OPAQUE_DITHER)
        uint16_t tmpbuf[UNROLL];
        int td[UNROLL];
        int tdv[UNROLL];
        int ta[UNROLL];
        int tap[UNROLL];
        uint16_t in_dst[UNROLL];
        int offset = 0;
        int noisy = 0;
#endif

        uint8x8_t dbase;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        dbase = vld1_u8(dstart);

        do {
            uint8x8_t sr, sg, sb, sa, d;
            uint16x8_t dst8, scale8, alpha8;
            uint16x8_t dst_r, dst_g, dst_b;

#if defined(DEBUG_OPAQUE_DITHER)
            // calculate 8 elements worth into a temp buffer
            {
                int my_y = y;
                int my_x = x;
                SkPMColor* my_src = (SkPMColor*)src;
                uint16_t* my_dst = dst;
                int i;

                DITHER_565_SCAN(my_y);
                for(i = 0; i < UNROLL; i++) {
                    SkPMColor c = *my_src++;
                    SkPMColorAssert(c);
                    if (c) {
                        unsigned a = SkGetPackedA32(c);

                        int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
                        tdv[i] = DITHER_VALUE(my_x);
                        ta[i] = a;
                        tap[i] = SkAlpha255To256(a);
                        td[i] = d;

                        unsigned sr = SkGetPackedR32(c);
                        unsigned sg = SkGetPackedG32(c);
                        unsigned sb = SkGetPackedB32(c);
                        sr = SkDITHER_R32_FOR_565(sr, d);
                        sg = SkDITHER_G32_FOR_565(sg, d);
                        sb = SkDITHER_B32_FOR_565(sb, d);

                        uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                        uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                        dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                        // now src and dst expanded are in g:11 r:10 x:1 b:10
                        tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
                        td[i] = d;
                    } else {
                        tmpbuf[i] = *my_dst;
                        ta[i] = tdv[i] = td[i] = 0xbeef;
                    }
                    in_dst[i] = *my_dst;
                    my_dst += 1;
                    DITHER_INC_X(my_x);
                }
            }
#endif

            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
                    :
                );
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
                sr = d2; sg = d1; sb = d0; sa = d3;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
                sr = d0; sg = d1; sb = d2; sa = d3;
#endif
            }

            /* calculate 'd', which will be 0..7
             * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
             */
            alpha8 = vmovl_u8(dbase);
            alpha8 = vmlal_u8(alpha8, sa, dbase);
            d = vshrn_n_u16(alpha8, 8); // narrowing too
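            // i.e. d = (dbase * (1 + sa)) >> 8 per lane: with dbase in 0..7
            // and sa in 0..255 the product is at most 7*256 = 1792, so 16 bits
            // are enough, and the result matches
            // SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a)) as used by the
            // scalar leftover loop below.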

            // sr = sr - (sr>>5) + d
            /* watching for 8-bit overflow. d is 0..7; risky range of
             * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
             * safe as long as we do ((sr-sr>>5) + d)
             */
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            sr = vadd_u8(sr, d);

            // sb = sb - (sb>>5) + d
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            sb = vadd_u8(sb, d);

            // sg = sg - (sg>>6) + d>>1; similar logic for overflows
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            sg = vadd_u8(sg, vshr_n_u8(d,1));

            // need to pick up 8 dst's -- at 16 bits each, 128 bits
            dst8 = vld1q_u16(dst);
            dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
            dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
            dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits

            // blend
            scale8 = vsubw_u8(vdupq_n_u16(256), sa);

            // combine the addq and mul, save 3 insns
            scale8 = vshrq_n_u16(scale8, 3);
            dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
            dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
            dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);

            // repack to store
            dst8 = vshrq_n_u16(dst_b, 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

            vst1q_u16(dst, dst8);

#if defined(DEBUG_OPAQUE_DITHER)
            // verify my 8 elements match the temp buffer
            {
                int i, bad=0;
                static int invocation;

                for (i = 0; i < UNROLL; i++) {
                    if (tmpbuf[i] != dst[i]) {
                        bad=1;
                    }
                }
                if (bad) {
                    SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                             invocation, offset);
                    SkDebugf("  alpha 0x%x\n", alpha);
                    for (i = 0; i < UNROLL; i++)
                        SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                                 i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
                                 in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);

                    showme16("alpha8", &alpha8, sizeof(alpha8));
                    showme16("scale8", &scale8, sizeof(scale8));
                    showme8("d", &d, sizeof(d));
                    showme16("dst8", &dst8, sizeof(dst8));
                    showme16("dst_b", &dst_b, sizeof(dst_b));
                    showme16("dst_g", &dst_g, sizeof(dst_g));
                    showme16("dst_r", &dst_r, sizeof(dst_r));
                    showme8("sb", &sb, sizeof(sb));
                    showme8("sg", &sg, sizeof(sg));
                    showme8("sr", &sr, sizeof(sr));

                    return;
                }
                offset += UNROLL;
                invocation++;
            }
#endif
            dst += UNROLL;
            count -= UNROLL;
            // skip x += UNROLL, since it's unchanged mod-4
        } while (count >= UNROLL);
    }
#undef UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef DEBUG_S32_OPAQUE_DITHER

void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define UNROLL 8
    if (count >= UNROLL) {
        uint8x8_t d;
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
        d = vld1_u8(dstart);

        while (count >= UNROLL) {
            uint8x8_t sr, sg, sb;
            uint16x8_t dr, dg, db;
            uint16x8_t dst8;

            {
                register uint8x8_t d0 asm("d0");
                register uint8x8_t d1 asm("d1");
                register uint8x8_t d2 asm("d2");
                register uint8x8_t d3 asm("d3");

                asm (
                    "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                    : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                    :
                );
                sg = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
                sr = d2; sb = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
                sr = d0; sb = d2;
#endif
            }
            /* XXX: if we want to prefetch, hide it in the above asm()
             * using the gcc __builtin_prefetch(), the prefetch will
             * fall to the bottom of the loop -- it won't stick up
             * at the top of the loop, just after the vld4.
             */

            // sr = sr - (sr>>5) + d
            sr = vsub_u8(sr, vshr_n_u8(sr, 5));
            dr = vaddl_u8(sr, d);

            // sb = sb - (sb>>5) + d
            sb = vsub_u8(sb, vshr_n_u8(sb, 5));
            db = vaddl_u8(sb, d);

            // sg = sg - (sg>>6) + d>>1; similar logic for overflows
            sg = vsub_u8(sg, vshr_n_u8(sg, 6));
            dg = vaddl_u8(sg, vshr_n_u8(d, 1));

            // pack high bits of each into 565 format (rgb, b is lsb)
            dst8 = vshrq_n_u16(db, 3);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
            dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

            // store it
            vst1q_u16(dst, dst8);

#if defined(DEBUG_S32_OPAQUE_DITHER)
            // always good to know if we generated good results
            {
                int i, myx = x, myy = y;
                DITHER_565_SCAN(myy);
                for (i=0;i<UNROLL;i++) {
                    // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
                    SkPMColor c = src[i-8];
                    unsigned dither = DITHER_VALUE(myx);
                    uint16_t val = SkDitherRGB32To565(c, dither);
                    if (val != dst[i]) {
                        SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                                 c, dither, val, dst[i], dstart[i]);
                    }
                    DITHER_INC_X(myx);
                }
            }
#endif

            dst += UNROLL;
            // we don't need to increment src as the asm above has already done it
            count -= UNROLL;
            x += UNROLL; // probably superfluous
        }
    }
#undef UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
                      SkPMColor color) {
    if (count <= 0) {
        return;
    }

    if (0 == color) {
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        sk_memset32(dst, color, count);
    } else {
        unsigned scale = 256 - SkAlpha255To256(colorA);

        if (count >= 8) {
            // at the end of this assembly, count will have been decremented
            // to a negative value. That is, if count mod 8 = x, it will be
            // -8 +x coming out.
            asm volatile (
                PLD128(src, 0)

                "vdup.32 q0, %[color] \n\t"

                PLD128(src, 128)

                // scale numerical interval [0-255], so load as 8 bits
                "vdup.8 d2, %[scale] \n\t"

                PLD128(src, 256)

                "subs %[count], %[count], #8 \n\t"

                PLD128(src, 384)

                "Loop_Color32: \n\t"

                // load src color, 8 pixels, 4 64 bit registers
                // (and increment src).
                "vld1.32 {d4-d7}, [%[src]]! \n\t"

                PLD128(src, 384)

                // multiply long by scale, 64 bits at a time,
                // destination into a 128 bit register.
                "vmull.u8 q4, d4, d2 \n\t"
                "vmull.u8 q5, d5, d2 \n\t"
                "vmull.u8 q6, d6, d2 \n\t"
                "vmull.u8 q7, d7, d2 \n\t"

                // shift the 128 bit registers, containing the 16
                // bit scaled values back to 8 bits, narrowing the
                // results to 64 bit registers.
                "vshrn.i16 d8, q4, #8 \n\t"
                "vshrn.i16 d9, q5, #8 \n\t"
                "vshrn.i16 d10, q6, #8 \n\t"
                "vshrn.i16 d11, q7, #8 \n\t"

                // adding back the color, using 128 bit registers.
                "vadd.i8 q6, q4, q0 \n\t"
                "vadd.i8 q7, q5, q0 \n\t"

                // store back the 8 calculated pixels (2 128 bit
                // registers), and increment dst.
                "vst1.32 {d12-d15}, [%[dst]]! \n\t"

                "subs %[count], %[count], #8 \n\t"
                "bge Loop_Color32 \n\t"
                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                : [color] "r" (color), [scale] "r" (scale)
                : "cc", "memory",
                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
            );
            // At this point, if we went through the inline assembly, count is
            // a negative value:
            // if the value is -8, there is no pixel left to process.
            // if the value is -7, there is one pixel left to process
            // ...
            // And'ing it with 7 will give us the number of pixels
            // left to process.
            count = count & 0x7;
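            // Example: count = 13 enters the asm with 13, is reduced to 5
            // before the loop label, processes 8 pixels, and leaves the loop
            // at 5 - 8 = -3; -3 & 0x7 == 5, the number of pixels still to
            // handle in the scalar loop below.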
        }

        while (count > 0) {
            *dst = color + SkAlphaMulQ(*src, scale);
            src += 1;
            dst += 1;
            count--;
        }
    }
}

///////////////////////////////////////////////////////////////////////////////

const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    // NOTE: For the S32_D565_Blend function below, we don't have a special
    //       version that assumes that each source pixel is opaque. But our
    //       S32A is still faster than the default, so use it.
    S32_D565_Opaque_neon,
    S32A_D565_Blend_neon,        // really S32_D565_Blend
    S32A_D565_Opaque_neon,
    S32A_D565_Blend_neon,

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,                        // S32A_D565_Blend_Dither
};

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,                        // S32_Opaque,
    S32_Blend_BlitRow32_neon,    // S32_Blend,
    /*
     * We have two choices for S32A_Opaque procs. The one reads the src alpha
     * value and attempts to optimize accordingly. The optimization is
     * sensitive to the source content and is not a win in all cases. For
     * example, if there are a lot of transitions between the alpha states,
     * the performance will almost certainly be worse. However, for many
     * common cases the performance is equivalent or better than the standard
     * case where we do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon,             // S32A_Opaque,
#endif
    S32A_Blend_BlitRow32_neon               // S32A_Blend
};