gfx/skia/trunk/src/opts/SkBlitRow_opts_arm_neon.cpp

author      Michael Schloh von Bennewitz <michael@schloh.com>
date        Sat, 03 Jan 2015 20:18:00 +0100
branch      TOR_BUG_3246
changeset   7:129ffea94266
permissions -rw-r--r--

Conditionally enable double-key logic according to private browsing mode or
the privacy.thirdparty.isolate preference, and implement it in
GetCookieStringCommon and FindCookie where it counts... with some
reservations about how to convince FindCookie callers to test the condition
and pass a nullptr when double-key logic is disabled.

/*
 * Copyright 2012 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

#include "SkBlitRow_opts_arm_neon.h"

#include "SkBlitMask.h"
#include "SkBlitRow.h"
#include "SkColorPriv.h"
#include "SkDither.h"
#include "SkMathPriv.h"
#include "SkUtils.h"

#include "SkCachePreload_arm.h"
#include "SkColor_opts_neon.h"
#include <arm_neon.h>

void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    while (count >= 8) {
        uint8x8x4_t vsrc;
        uint16x8_t vdst;

        // Load
        vsrc = vld4_u8((uint8_t*)src);

        // Convert src to 565
        vdst = SkPixel32ToPixel16_neon8(vsrc);

        // Store
        vst1q_u16(dst, vdst);

        // Prepare next iteration
        dst += 8;
        src += 8;
        count -= 8;
    }

    // Leftovers
    while (count > 0) {
        SkPMColor c = *src++;
        SkPMColorAssert(c);
        *dst = SkPixel32ToPixel16_ToU16(c);
        dst++;
        count--;
    }
}

void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
                           const SkPMColor* SK_RESTRICT src, int count,
                           U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 == alpha);

    if (count >= 8) {
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "ands       ip, %[count], #7            \n\t"
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "vld1.16    {q12}, [%[dst]]             \n\t"
                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
                      // Thumb does not support the standard ARM conditional
                      // instructions but instead requires the 'it' instruction
                      // to signal conditional execution
                      "it eq                                  \n\t"
                      "moveq      ip, #8                      \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "add        %[src], %[src], ip, LSL#2   \n\t"
                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
                      "subs       %[count], %[count], ip      \n\t"
                      "b          9f                          \n\t"
                      // LOOP
                      "2:                                         \n\t"

                      "vld1.16    {q12}, [%[dst]]!            \n\t"
                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
                      "subs       %[count], %[count], #8      \n\t"
                      "9:                                         \n\t"
                      "pld        [%[dst],#32]                \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix versions
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      "bne        2b                          \n\t"

                      "1:                                         \n\t"
                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
    else
    {   // handle count < 8
        uint16_t* SK_RESTRICT keep_dst = 0;

        asm volatile (
                      "vmov.u8    d31, #1<<7                  \n\t"
                      "mov        %[keep_dst], %[dst]         \n\t"

                      "tst        %[count], #4                \n\t"
                      "beq        14f                         \n\t"
                      "vld1.16    {d25}, [%[dst]]!            \n\t"
                      "vld1.32    {q1}, [%[src]]!             \n\t"

                      "14:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        12f                         \n\t"
                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d1}, [%[src]]!             \n\t"

                      "12:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        11f                         \n\t"
                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"

                      "11:                                        \n\t"
                      // unzips achieve the same as a vld4 operation
                      "vuzpq.u16  q0, q1                      \n\t"
                      "vuzp.u8    d0, d1                      \n\t"
                      "vuzp.u8    d2, d3                      \n\t"
                      // expand 0565 q12 to 8888 {d4-d7}
                      "vmovn.u16  d4, q12                     \n\t"
                      "vshr.u16   q11, q12, #5                \n\t"
                      "vshr.u16   q10, q12, #6+5              \n\t"
                      "vmovn.u16  d5, q11                     \n\t"
                      "vmovn.u16  d6, q10                     \n\t"
                      "vshl.u8    d4, d4, #3                  \n\t"
                      "vshl.u8    d5, d5, #2                  \n\t"
                      "vshl.u8    d6, d6, #3                  \n\t"

                      "vmovl.u8   q14, d31                    \n\t"
                      "vmovl.u8   q13, d31                    \n\t"
                      "vmovl.u8   q12, d31                    \n\t"

                      // duplicate in 4/2/1 & 8pix versions
                      "vmvn.8     d30, d3                     \n\t"
                      "vmlal.u8   q14, d30, d6                \n\t"
                      "vmlal.u8   q13, d30, d5                \n\t"
                      "vmlal.u8   q12, d30, d4                \n\t"
                      "vshr.u16   q8, q14, #5                 \n\t"
                      "vshr.u16   q9, q13, #6                 \n\t"
                      "vaddhn.u16 d6, q14, q8                 \n\t"
                      "vshr.u16   q8, q12, #5                 \n\t"
                      "vaddhn.u16 d5, q13, q9                 \n\t"
                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
                      "vaddhn.u16 d4, q12, q8                 \n\t"
                      // intentionally don't calculate alpha
                      // result in d4-d6

                      "vqadd.u8   d5, d5, d1                  \n\t"
                      "vqadd.u8   d4, d4, d2                  \n\t"

                      // pack 8888 {d4-d6} to 0565 q10
                      "vshll.u8   q10, d6, #8                 \n\t"
                      "vshll.u8   q3, d5, #8                  \n\t"
                      "vshll.u8   q2, d4, #8                  \n\t"
                      "vsri.u16   q10, q3, #5                 \n\t"
                      "vsri.u16   q10, q2, #11                \n\t"

                      // store
                      "tst        %[count], #4                \n\t"
                      "beq        24f                         \n\t"
                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"

                      "24:                                        \n\t"
                      "tst        %[count], #2                \n\t"
                      "beq        22f                         \n\t"
                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "22:                                        \n\t"
                      "tst        %[count], #1                \n\t"
                      "beq        21f                         \n\t"
                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"

                      "21:                                        \n\t"
                      : [count] "+r" (count)
                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
                      "d30","d31"
                      );
    }
}
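
/* Note on the rounding trick below: for any 16-bit product p of two 8-bit
 * values, (p + 128 + ((p + 128) >> 8)) >> 8 equals p / 255 rounded to
 * nearest. Worked example: p = 255 * 255 = 65025 -> +128 = 65153 ->
 * + (65153 >> 8) = 65153 + 254 = 65407 -> >> 8 = 255, as expected.
 */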
static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
    prod += vdupq_n_u16(128);
    prod += vshrq_n_u16(prod, 8);
    return vshrq_n_u16(prod, 8);
}

void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
                          const SkPMColor* SK_RESTRICT src, int count,
                          U8CPU alpha, int /*x*/, int /*y*/) {
    SkASSERT(255 > alpha);

    /* This code implements a Neon version of S32A_D565_Blend. The results have
     * a few mismatches compared to the original code. These mismatches never
     * exceed 1.
     */

    if (count >= 8) {
        uint16x8_t valpha_max, vmask_blue;
        uint8x8_t valpha;

        // prepare constants
        valpha_max = vmovq_n_u16(255);
        valpha = vdup_n_u8(alpha);
        vmask_blue = vmovq_n_u16(SK_B16_MASK);

        do {
            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
            uint16x8_t vres_a, vres_r, vres_g, vres_b;
            uint8x8x4_t vsrc;

            // load pixels
            vdst = vld1q_u16(dst);
#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
            asm (
                "vld4.u8 %h[vsrc], [%[src]]!"
                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
                : :
            );
#else
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm volatile (
                "vld4.u8    {d0-d3},[%[src]]!;"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
                  [src] "+&r" (src)
                : :
            );
            vsrc.val[0] = d0;
            vsrc.val[1] = d1;
            vsrc.val[2] = d2;
            vsrc.val[3] = d3;
#endif

            // deinterleave dst
            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
            vdst_b = vdst & vmask_blue;                     // extract blue
            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green

            // shift src to 565
            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);

            // calc src * src_scale
            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);

            // prepare dst_scale
            vres_a = SkDiv255Round_neon8(vres_a);
            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255

            // add dst * dst_scale to previous result
            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);

#ifdef S32A_D565_BLEND_EXACT
            // It is possible to get exact results with this but it is slow,
            // even slower than C code in some cases
            vres_r = SkDiv255Round_neon8(vres_r);
            vres_g = SkDiv255Round_neon8(vres_g);
            vres_b = SkDiv255Round_neon8(vres_b);
#else
            vres_r = vrshrq_n_u16(vres_r, 8);
            vres_g = vrshrq_n_u16(vres_g, 8);
            vres_b = vrshrq_n_u16(vres_b, 8);
#endif
            // pack result
            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue

            // store
            vst1q_u16(dst, vres_b);
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // leftovers
    while (count-- > 0) {
        SkPMColor sc = *src++;
        if (sc) {
            uint16_t dc = *dst;
            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
        }
        dst += 1;
    }
}

/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
 * each dither value is spaced out into byte lanes, and repeated
 * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
 * start of each row.
 */
static const uint8_t gDitherMatrix_Neon[48] = {
    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
};
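
/* Example: for (x, y) == (2, 1) the callers below compute
 * &gDitherMatrix_Neon[1*12 + 2], so the 8-byte load yields
 * { 7, 3, 6, 2, 7, 3, 6, 2 }: row 1 of the 4x4 matrix starting at
 * column 2 and wrapping mod 4.
 */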
void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
                                int count, U8CPU alpha, int x, int y)
{
    SkASSERT(255 > alpha);

    // rescale alpha to range 1 - 256
    int scale = SkAlpha255To256(alpha);

    if (count >= 8) {
        /* select row and offset for dither array */
        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];

        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values

        int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
        uint16x8_t vmask_b = vdupq_n_u16(0x1F);         // set up blue mask

        do {
            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
            uint16x8_t vdst;
            uint16x8_t vdst_r, vdst_g, vdst_b;
            int16x8_t vres_r, vres_g, vres_b;
            int8x8_t vres8_r, vres8_g, vres8_b;

            // Load source and add dither
            {
            register uint8x8_t d0 asm("d0");
            register uint8x8_t d1 asm("d1");
            register uint8x8_t d2 asm("d2");
            register uint8x8_t d3 asm("d3");

            asm (
                "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
                :
            );
            vsrc_g = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
            vsrc_r = d2; vsrc_b = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
            vsrc_r = d0; vsrc_b = d2;
#endif
            }

            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5

            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen

            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result

            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);

            // Load dst and unpack
            vdst = vld1q_u16(dst);
            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue

            // subtract dst from src and widen
            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));

            // multiply diffs by scale and shift
            vres_r = vmulq_s16(vres_r, vscale);
            vres_g = vmulq_s16(vres_g, vscale);
            vres_b = vmulq_s16(vres_b, vscale);

            vres8_r = vshrn_n_s16(vres_r, 8);
            vres8_g = vshrn_n_s16(vres_g, 8);
            vres8_b = vshrn_n_s16(vres_b, 8);

            // add dst to result
            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);

            // put result into 565 format
            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue

            // Store result
            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));

            // Next iteration
            dst += 8;
            count -= 8;
        } while (count >= 8);
    }

    // Leftovers
    if (count > 0) {
        int scale = SkAlpha255To256(alpha);
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);

            int dither = DITHER_VALUE(x);
            int sr = SkGetPackedR32(c);
            int sg = SkGetPackedG32(c);
            int sb = SkGetPackedB32(c);
            sr = SkDITHER_R32To565(sr, dither);
            sg = SkDITHER_G32To565(sg, dither);
            sb = SkDITHER_B32To565(sb, dither);

            uint16_t d = *dst;
            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {

    SkASSERT(255 == alpha);
    if (count > 0) {

    uint8x8_t alpha_mask;

    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);
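
    /* vtbl1_u8 with this mask broadcasts byte 3 (first pixel) into lanes 0-3
     * and byte 7 (second pixel) into lanes 4-7; with alpha stored in the top
     * byte of each pixel, every color channel lines up with its own alpha.
     */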

    /* do the NEON unrolled code */
#define    UNROLL    4
    while (count >= UNROLL) {
        uint8x8_t src_raw, dst_raw, dst_final;
        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;

        /* The two prefetches below may make the code slightly
         * slower for small values of count but are worth having
         * in the general case.
         */
        __builtin_prefetch(src+32);
        __builtin_prefetch(dst+32);

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
#if    UNROLL > 2
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
#endif

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
#if    UNROLL > 2
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
#endif

    /* 1st and 2nd bits of the unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);
    }

#if    UNROLL > 2
    /* the 3rd and 4th bits of our unrolling */
    {
        uint8x8_t dst_cooked;
        uint16x8_t dst_wide;
        uint8x8_t alpha_narrow;
        uint16x8_t alpha_wide;

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
    }
#endif

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
#if    UNROLL > 2
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
#endif

        src += UNROLL;
        dst += UNROLL;
        count -= UNROLL;
    }
#undef    UNROLL

        /* do any residual iterations */
        while (--count >= 0) {
            *dst = SkPMSrcOver(*src, *dst);
            src += 1;
            dst += 1;
        }
    }
}

void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
                                const SkPMColor* SK_RESTRICT src,
                                int count, U8CPU alpha) {
    SkASSERT(255 == alpha);

    if (count <= 0)
    return;

    /* Use these to check if src is transparent or opaque */
    const unsigned int ALPHA_OPAQ  = 0xFF000000;
    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
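
    /* With alpha in the top byte (SK_A32_SHIFT == 24), whole-pixel compares
     * do double duty: *src <= ALPHA_TRANS means the alpha byte is 0, and
     * *src >= ALPHA_OPAQ means the alpha byte is 0xFF.
     */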

#define UNROLL  4
    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
    const SkPMColor* SK_RESTRICT src_temp = src;

    /* set up the NEON variables */
    uint8x8_t alpha_mask;
    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
    alpha_mask = vld1_u8(alpha_mask_setup);

    uint8x8_t src_raw, dst_raw, dst_final;
    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
    uint8x8_t dst_cooked;
    uint16x8_t dst_wide;
    uint8x8_t alpha_narrow;
    uint16x8_t alpha_wide;

    /* choose the first processing type */
    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    /* fall-thru */

ALPHA_1_TO_254:
    do {

        /* get the source */
        src_raw = vreinterpret_u8_u32(vld1_u32(src));
        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));

        /* get and hold the dst too */
        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));

        /* get the alphas spread out properly */
        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final = vadd_u8(src_raw, dst_cooked);

        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
        /* we collapsed (255-a)+1 ... */
        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);

        /* spread the dest */
        dst_wide = vmovl_u8(dst_raw_2);

        /* alpha mul the dest */
        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
        dst_cooked = vshrn_n_u16(dst_wide, 8);

        /* sum -- ignoring any byte lane overflows */
        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);

        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));

        src += UNROLL;
        dst += UNROLL;

        /* if 2 of the next pixels aren't between 1 and 254
           it might make sense to go to the optimized loops */
        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
            break;

    } while(src < src_end);

    if (src >= src_end)
        goto TAIL;

    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
        goto ALPHA_255;

    /*fall-thru*/

ALPHA_0:

    /* In this state, we know the current alpha is 0 and we optimize for
       the next alpha also being zero. */
    src_temp = src;  // so we don't have to increment dst every time
    do {
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
        if(*(++src) > ALPHA_TRANS)
            break;
    } while(src < src_end);

    dst += (src - src_temp);

    /* no longer alpha 0, so determine where to go next. */
    if( src >= src_end)
        goto TAIL;
    if(*src >= ALPHA_OPAQ)
        goto ALPHA_255;
    else
        goto ALPHA_1_TO_254;

ALPHA_255:
    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
        dst[0]=src[0];
        dst[1]=src[1];
        dst[2]=src[2];
        dst[3]=src[3];
        src+=UNROLL;
        dst+=UNROLL;
        if(src >= src_end)
            goto TAIL;
    }

    //Handle remainder.
    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
        }
    }

    if( src >= src_end)
        goto TAIL;
    if(*src <= ALPHA_TRANS)
        goto ALPHA_0;
    else
        goto ALPHA_1_TO_254;

TAIL:
    /* do any residual iterations */
    src_end += UNROLL + 1;  // go to the real end
    while(src != src_end) {
        if( *src != 0 ) {
            if( *src >= ALPHA_OPAQ ) {
                *dst = *src;
            }
            else {
                *dst = SkPMSrcOver(*src, *dst);
            }
        }
        src++;
        dst++;
    }

#undef    UNROLL
    return;
}

/* Neon version of S32_Blend_BlitRow32()
 * portable version is in src/core/SkBlitRow_D32.cpp
 */
void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                              const SkPMColor* SK_RESTRICT src,
                              int count, U8CPU alpha) {
    SkASSERT(alpha <= 255);

    if (count <= 0) {
        return;
    }

    uint16_t src_scale = SkAlpha255To256(alpha);
    uint16_t dst_scale = 256 - src_scale;
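
    /* Per pixel this computes
     *   result = (src * src_scale + dst * dst_scale) >> 8
     * with src_scale = alpha + 1 and dst_scale = 256 - src_scale, matching
     * the portable version referenced above.
     */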

    while (count >= 2) {
        uint8x8_t vsrc, vdst, vres;
        uint16x8_t vsrc_wide, vdst_wide;

        /* These commented prefetches are a big win for count
         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
         * They also hurt a little (<5%) on an A15
         */
        //__builtin_prefetch(src+32);
        //__builtin_prefetch(dst+32);

        // Load
        vsrc = vreinterpret_u8_u32(vld1_u32(src));
        vdst = vreinterpret_u8_u32(vld1_u32(dst));

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));

        // Process dst
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_u32(dst, vreinterpret_u32_u8(vres));

        src += 2;
        dst += 2;
        count -= 2;
    }

    if (count == 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vsrc_wide, vdst_wide;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Process
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        // Store
        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
    }
}

void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
                         const SkPMColor* SK_RESTRICT src,
                         int count, U8CPU alpha) {

    SkASSERT(255 >= alpha);

    if (count <= 0) {
        return;
    }

    unsigned alpha256 = SkAlpha255To256(alpha);

    // First deal with odd counts
    if (count & 1) {
        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
        uint16x8_t vdst_wide, vsrc_wide;
        unsigned dst_scale;

        // Load
        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));

        // Calc dst_scale
        dst_scale = vget_lane_u8(vsrc, 3);
        dst_scale *= alpha256;
        dst_scale >>= 8;
        dst_scale = 256 - dst_scale;

        // Process src
        vsrc_wide = vmovl_u8(vsrc);
        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);

        // Process dst
        vdst_wide = vmovl_u8(vdst);
        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);

        // Combine
        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
        dst++;
        src++;
        count--;
    }

    if (count) {
        uint8x8_t alpha_mask;
        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
        alpha_mask = vld1_u8(alpha_mask_setup);

        do {
            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;

            __builtin_prefetch(src+32);
            __builtin_prefetch(dst+32);

            // Load
            vsrc = vreinterpret_u8_u32(vld1_u32(src));
            vdst = vreinterpret_u8_u32(vld1_u32(dst));

            // Prepare src_scale
            vsrc_scale = vdupq_n_u16(alpha256);

            // Calc dst_scale
            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
            vdst_scale = vmovl_u8(vsrc_alphas);
            vdst_scale *= vsrc_scale;
            vdst_scale = vshrq_n_u16(vdst_scale, 8);
            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);

            // Process src
            vsrc_wide = vmovl_u8(vsrc);
            vsrc_wide *= vsrc_scale;

            // Process dst
            vdst_wide = vmovl_u8(vdst);
            vdst_wide *= vdst_scale;

            // Combine
            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);

            vst1_u32(dst, vreinterpret_u32_u8(vres));

            src += 2;
            dst += 2;
            count -= 2;
        } while(count);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef    DEBUG_OPAQUE_DITHER

#if    defined(DEBUG_OPAQUE_DITHER)
static void showme8(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    char *pc = (char*) p;
    sprintf(buf,"%8s:", str);
    for(i=0;i<len;i++) {
        sprintf(tbuf, "   %02x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
static void showme16(char *str, void *p, int len)
{
    static char buf[256];
    char tbuf[32];
    int i;
    uint16_t *pc = (uint16_t*) p;
    sprintf(buf,"%8s:", str);
    len = (len / sizeof(uint16_t));    /* passed as bytes */
    for(i=0;i<len;i++) {
        sprintf(tbuf, " %04x", pc[i]);
        strcat(buf, tbuf);
    }
    SkDebugf("%s\n", buf);
}
#endif

void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
                                   const SkPMColor* SK_RESTRICT src,
                                   int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8

    if (count >= UNROLL) {

#if defined(DEBUG_OPAQUE_DITHER)
    uint16_t tmpbuf[UNROLL];
    int td[UNROLL];
    int tdv[UNROLL];
    int ta[UNROLL];
    int tap[UNROLL];
    uint16_t in_dst[UNROLL];
    int offset = 0;
    int noisy = 0;
#endif

    uint8x8_t dbase;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    dbase = vld1_u8(dstart);

        do {
        uint8x8_t sr, sg, sb, sa, d;
        uint16x8_t dst8, scale8, alpha8;
        uint16x8_t dst_r, dst_g, dst_b;

#if defined(DEBUG_OPAQUE_DITHER)
        // calculate 8 elements worth into a temp buffer
        {
        int my_y = y;
        int my_x = x;
        SkPMColor* my_src = (SkPMColor*)src;
        uint16_t* my_dst = dst;
        int i;

        DITHER_565_SCAN(my_y);
        for(i = 0; i < UNROLL; i++) {
            SkPMColor c = *my_src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
                tdv[i] = DITHER_VALUE(my_x);
                ta[i] = a;
                tap[i] = SkAlpha255To256(a);
                td[i] = d;

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
                td[i] = d;
            } else {
                tmpbuf[i] = *my_dst;
                ta[i] = tdv[i] = td[i] = 0xbeef;
            }
            in_dst[i] = *my_dst;
            my_dst += 1;
            DITHER_INC_X(my_x);
        }
        }
#endif

        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm ("vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
            :
        );
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
            sr = d2; sg = d1; sb = d0; sa = d3;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
            sr = d0; sg = d1; sb = d2; sa = d3;
#endif
        }

        /* calculate 'd', which will be 0..7
         * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
         */
        alpha8 = vmovl_u8(dbase);
        alpha8 = vmlal_u8(alpha8, sa, dbase);
        d = vshrn_n_u16(alpha8, 8);    // narrowing too

        // sr = sr - (sr>>5) + d
        /* watching for 8-bit overflow. d is 0..7; risky range of
         * sr is >248; and then (sr>>5) is 7 so it offsets 'd';
         * safe as long as we do ((sr-sr>>5) + d)
         */
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        sr = vadd_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        sb = vadd_u8(sb, d);

        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        sg = vadd_u8(sg, vshr_n_u8(d,1));

        // need to pick up 8 dst's -- at 16 bits each, 128 bits
        dst8 = vld1q_u16(dst);
        dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
        dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
        dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits

        // blend
        scale8 = vsubw_u8(vdupq_n_u16(256), sa);

        // combine the addq and mul, save 3 insns
        scale8 = vshrq_n_u16(scale8, 3);
        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);

        // repack to store
        dst8 = vshrq_n_u16(dst_b, 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);

        vst1q_u16(dst, dst8);

#if defined(DEBUG_OPAQUE_DITHER)
        // verify my 8 elements match the temp buffer
        {
        int i, bad=0;
        static int invocation;

        for (i = 0; i < UNROLL; i++) {
            if (tmpbuf[i] != dst[i]) {
                bad=1;
            }
        }
        if (bad) {
            SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
                     invocation, offset);
            SkDebugf("  alpha 0x%x\n", alpha);
            for (i = 0; i < UNROLL; i++)
                SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
                         i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
                         in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);

            showme16("alpha8", &alpha8, sizeof(alpha8));
            showme16("scale8", &scale8, sizeof(scale8));
            showme8("d", &d, sizeof(d));
            showme16("dst8", &dst8, sizeof(dst8));
            showme16("dst_b", &dst_b, sizeof(dst_b));
            showme16("dst_g", &dst_g, sizeof(dst_g));
            showme16("dst_r", &dst_r, sizeof(dst_r));
            showme8("sb", &sb, sizeof(sb));
            showme8("sg", &sg, sizeof(sg));
            showme8("sr", &sr, sizeof(sr));

            return;
        }
        offset += UNROLL;
        invocation++;
        }
#endif

        dst += UNROLL;
        count -= UNROLL;
        // skip x += UNROLL, since it's unchanged mod-4
        } while (count >= UNROLL);
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            if (c) {
                unsigned a = SkGetPackedA32(c);

                // dither and alpha are just temporary variables to work-around
                // an ICE in debug.
                unsigned dither = DITHER_VALUE(x);
                unsigned alpha = SkAlpha255To256(a);
                int d = SkAlphaMul(dither, alpha);

                unsigned sr = SkGetPackedR32(c);
                unsigned sg = SkGetPackedG32(c);
                unsigned sb = SkGetPackedB32(c);
                sr = SkDITHER_R32_FOR_565(sr, d);
                sg = SkDITHER_G32_FOR_565(sg, d);
                sb = SkDITHER_B32_FOR_565(sb, d);

                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
                // now src and dst expanded are in g:11 r:10 x:1 b:10
                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
            }
            dst += 1;
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

///////////////////////////////////////////////////////////////////////////////

#undef    DEBUG_S32_OPAQUE_DITHER

void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
                                 const SkPMColor* SK_RESTRICT src,
                                 int count, U8CPU alpha, int x, int y) {
    SkASSERT(255 == alpha);

#define    UNROLL    8
    if (count >= UNROLL) {
    uint8x8_t d;
    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
    d = vld1_u8(dstart);

    while (count >= UNROLL) {
        uint8x8_t sr, sg, sb;
        uint16x8_t dr, dg, db;
        uint16x8_t dst8;

        {
        register uint8x8_t d0 asm("d0");
        register uint8x8_t d1 asm("d1");
        register uint8x8_t d2 asm("d2");
        register uint8x8_t d3 asm("d3");

        asm (
            "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
            :
        );
        sg = d1;
#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
        sr = d2; sb = d0;
#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
        sr = d0; sb = d2;
#endif
        }

        /* XXX: if we want to prefetch, hide it in the above asm()
         * using the gcc __builtin_prefetch(), the prefetch will
         * fall to the bottom of the loop -- it won't stick up
         * at the top of the loop, just after the vld4.
         */

        // sr = sr - (sr>>5) + d
        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
        dr = vaddl_u8(sr, d);

        // sb = sb - (sb>>5) + d
        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
        db = vaddl_u8(sb, d);

        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
        dg = vaddl_u8(sg, vshr_n_u8(d, 1));

        // pack high bits of each into 565 format  (rgb, b is lsb)
        dst8 = vshrq_n_u16(db, 3);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);

        // store it
        vst1q_u16(dst, dst8);

#if    defined(DEBUG_S32_OPAQUE_DITHER)
        // always good to know if we generated good results
        {
        int i, myx = x, myy = y;
        DITHER_565_SCAN(myy);
        for (i=0;i<UNROLL;i++) {
            // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
            SkPMColor c = src[i-8];
            unsigned dither = DITHER_VALUE(myx);
            uint16_t val = SkDitherRGB32To565(c, dither);
            if (val != dst[i]) {
                SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n",
                    c, dither, val, dst[i], dstart[i]);
            }
            DITHER_INC_X(myx);
        }
        }
#endif

        dst += UNROLL;
        // we don't need to increment src as the asm above has already done it
        count -= UNROLL;
        x += UNROLL;        // probably superfluous
    }
    }
#undef    UNROLL

    // residuals
    if (count > 0) {
        DITHER_565_SCAN(y);
        do {
            SkPMColor c = *src++;
            SkPMColorAssert(c);
            SkASSERT(SkGetPackedA32(c) == 255);

            unsigned dither = DITHER_VALUE(x);
            *dst++ = SkDitherRGB32To565(c, dither);
            DITHER_INC_X(x);
        } while (--count != 0);
    }
}

void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
                      SkPMColor color) {
    if (count <= 0) {
        return;
    }

    if (0 == color) {
        if (src != dst) {
            memcpy(dst, src, count * sizeof(SkPMColor));
        }
        return;
    }

    unsigned colorA = SkGetPackedA32(color);
    if (255 == colorA) {
        sk_memset32(dst, color, count);
    } else {
        unsigned scale = 256 - SkAlpha255To256(colorA);
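
        /* Per pixel this is src-over with a constant color:
         *   dst = color + SkAlphaMulQ(src, scale)
         * where scale = 256 - (colorA + 1), so each src channel is
         * multiplied by (255 - colorA) / 256; see the scalar loop below.
         */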

        if (count >= 8) {
            // at the end of this assembly, count will have been decremented
            // to a negative value. That is, if count mod 8 = x, it will be
            // -8 +x coming out.
            asm volatile (
                PLD128(src, 0)

                "vdup.32    q0, %[color]                \n\t"

                PLD128(src, 128)

                // scale numerical interval [0-255], so load as 8 bits
                "vdup.8     d2, %[scale]                \n\t"

                PLD128(src, 256)

                "subs       %[count], %[count], #8      \n\t"

                PLD128(src, 384)

                "Loop_Color32:                          \n\t"

                // load src color, 8 pixels, 4 64 bit registers
                // (and increment src).
                "vld1.32    {d4-d7}, [%[src]]!          \n\t"

                PLD128(src, 384)

                // multiply long by scale, 64 bits at a time,
                // destination into a 128 bit register.
                "vmull.u8   q4, d4, d2                  \n\t"
                "vmull.u8   q5, d5, d2                  \n\t"
                "vmull.u8   q6, d6, d2                  \n\t"
                "vmull.u8   q7, d7, d2                  \n\t"

                // shift the 128 bit registers, containing the 16
                // bit scaled values back to 8 bits, narrowing the
                // results to 64 bit registers.
                "vshrn.i16  d8, q4, #8                  \n\t"
                "vshrn.i16  d9, q5, #8                  \n\t"
                "vshrn.i16  d10, q6, #8                 \n\t"
                "vshrn.i16  d11, q7, #8                 \n\t"

                // adding back the color, using 128 bit registers.
                "vadd.i8    q6, q4, q0                  \n\t"
                "vadd.i8    q7, q5, q0                  \n\t"

                // store back the 8 calculated pixels (2 128 bit
                // registers), and increment dst.
                "vst1.32    {d12-d15}, [%[dst]]!        \n\t"

                "subs       %[count], %[count], #8      \n\t"
                "bge        Loop_Color32                \n\t"
                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
                : [color] "r" (color), [scale] "r" (scale)
                : "cc", "memory",
                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
                          );
            // At this point, if we went through the inline assembly, count is
            // a negative value:
            // if the value is -8, there is no pixel left to process.
            // if the value is -7, there is one pixel left to process
            // ...
            // And'ing it with 7 will give us the number of pixels
            // left to process.
            count = count & 0x7;
        }

        while (count > 0) {
            *dst = color + SkAlphaMulQ(*src, scale);
            src += 1;
            dst += 1;
            count--;
        }
    }
}

///////////////////////////////////////////////////////////////////////////////
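
// Both tables below are indexed by SkBlitRow flag bits; assuming the flag
// layout of this Skia revision, bit 0 selects global alpha, bit 1 selects
// per-pixel (source) alpha, and bit 2 selects dither in the 565 table.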

const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
    // no dither
    // NOTE: For the S32_D565_Blend function below, we don't have a special
    //       version that assumes that each source pixel is opaque. But our
    //       S32A is still faster than the default, so use it.
    S32_D565_Opaque_neon,
    S32A_D565_Blend_neon,   // really S32_D565_Blend
    S32A_D565_Opaque_neon,
    S32A_D565_Blend_neon,

    // dither
    S32_D565_Opaque_Dither_neon,
    S32_D565_Blend_Dither_neon,
    S32A_D565_Opaque_Dither_neon,
    NULL,   // S32A_D565_Blend_Dither
};

const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
    NULL,   // S32_Opaque,
    S32_Blend_BlitRow32_neon,        // S32_Blend,
    /*
     * We have two choices for S32A_Opaque procs. One reads the src alpha
     * value and attempts to optimize accordingly. The optimization is
     * sensitive to the source content and is not a win in all cases. For
     * example, if there are a lot of transitions between the alpha states,
     * the performance will almost certainly be worse. However, for many
     * common cases the performance is equivalent or better than the standard
     * case where we do not inspect the src alpha.
     */
#if SK_A32_SHIFT == 24
    // This proc assumes the alpha value occupies bits 24-31 of each SkPMColor
    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
#else
    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
#endif
    S32A_Blend_BlitRow32_neon        // S32A_Blend
};
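
// These tables are picked up by the ARM implementations of
// SkBlitRow::PlatformProcs565() and SkBlitRow::PlatformProcs32() (see
// SkBlitRow_opts_arm.cpp), which return the NEON variants at runtime when
// the CPU reports NEON support.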
