gfx/skia/trunk/src/opts/SkBlitRow_opts_arm_neon.cpp

changeset 0:6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/skia/trunk/src/opts/SkBlitRow_opts_arm_neon.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,1422 @@
     1.4 +/*
     1.5 + * Copyright 2012 The Android Open Source Project
     1.6 + *
     1.7 + * Use of this source code is governed by a BSD-style license that can be
     1.8 + * found in the LICENSE file.
     1.9 + */
    1.10 +
    1.11 +#include "SkBlitRow_opts_arm_neon.h"
    1.12 +
    1.13 +#include "SkBlitMask.h"
    1.14 +#include "SkBlitRow.h"
    1.15 +#include "SkColorPriv.h"
    1.16 +#include "SkDither.h"
    1.17 +#include "SkMathPriv.h"
    1.18 +#include "SkUtils.h"
    1.19 +
    1.20 +#include "SkCachePreload_arm.h"
    1.21 +#include "SkColor_opts_neon.h"
    1.22 +#include <arm_neon.h>
    1.23 +
    1.24 +void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
    1.25 +                           const SkPMColor* SK_RESTRICT src, int count,
    1.26 +                           U8CPU alpha, int /*x*/, int /*y*/) {
    1.27 +    SkASSERT(255 == alpha);
    1.28 +
    1.29 +    while (count >= 8) {
    1.30 +        uint8x8x4_t vsrc;
    1.31 +        uint16x8_t vdst;
    1.32 +
    1.33 +        // Load
    1.34 +        vsrc = vld4_u8((uint8_t*)src);
    1.35 +
    1.36 +        // Convert src to 565
    1.37 +        vdst = SkPixel32ToPixel16_neon8(vsrc);
    1.38 +
    1.39 +        // Store
    1.40 +        vst1q_u16(dst, vdst);
    1.41 +
    1.42 +        // Prepare next iteration
    1.43 +        dst += 8;
    1.44 +        src += 8;
    1.45 +        count -= 8;
     1.46 +    }
    1.47 +
    1.48 +    // Leftovers
    1.49 +    while (count > 0) {
    1.50 +        SkPMColor c = *src++;
    1.51 +        SkPMColorAssert(c);
    1.52 +        *dst = SkPixel32ToPixel16_ToU16(c);
    1.53 +        dst++;
    1.54 +        count--;
     1.55 +    }
    1.56 +}
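Note: per pixel, SkPixel32ToPixel16_neon8 performs the usual 8888-to-565 truncation, eight lanes at a time. A minimal scalar sketch of that packing (hypothetical helper; channel order is illustrative only):

    static inline uint16_t pack_8888_to_565(uint8_t r, uint8_t g, uint8_t b) {
        // keep the top 5/6/5 bits of each channel
        return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
    }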
    1.57 +
    1.58 +void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst,
    1.59 +                           const SkPMColor* SK_RESTRICT src, int count,
    1.60 +                           U8CPU alpha, int /*x*/, int /*y*/) {
    1.61 +    SkASSERT(255 == alpha);
    1.62 +
    1.63 +    if (count >= 8) {
    1.64 +        uint16_t* SK_RESTRICT keep_dst = 0;
    1.65 +
    1.66 +        asm volatile (
    1.67 +                      "ands       ip, %[count], #7            \n\t"
    1.68 +                      "vmov.u8    d31, #1<<7                  \n\t"
    1.69 +                      "vld1.16    {q12}, [%[dst]]             \n\t"
    1.70 +                      "vld4.8     {d0-d3}, [%[src]]           \n\t"
    1.71 +                      // Thumb does not support the standard ARM conditional
    1.72 +                      // instructions but instead requires the 'it' instruction
    1.73 +                      // to signal conditional execution
    1.74 +                      "it eq                                  \n\t"
    1.75 +                      "moveq      ip, #8                      \n\t"
    1.76 +                      "mov        %[keep_dst], %[dst]         \n\t"
    1.77 +
    1.78 +                      "add        %[src], %[src], ip, LSL#2   \n\t"
    1.79 +                      "add        %[dst], %[dst], ip, LSL#1   \n\t"
    1.80 +                      "subs       %[count], %[count], ip      \n\t"
    1.81 +                      "b          9f                          \n\t"
    1.82 +                      // LOOP
    1.83 +                      "2:                                         \n\t"
    1.84 +
    1.85 +                      "vld1.16    {q12}, [%[dst]]!            \n\t"
    1.86 +                      "vld4.8     {d0-d3}, [%[src]]!          \n\t"
    1.87 +                      "vst1.16    {q10}, [%[keep_dst]]        \n\t"
    1.88 +                      "sub        %[keep_dst], %[dst], #8*2   \n\t"
    1.89 +                      "subs       %[count], %[count], #8      \n\t"
    1.90 +                      "9:                                         \n\t"
    1.91 +                      "pld        [%[dst],#32]                \n\t"
    1.92 +                      // expand 0565 q12 to 8888 {d4-d7}
    1.93 +                      "vmovn.u16  d4, q12                     \n\t"
    1.94 +                      "vshr.u16   q11, q12, #5                \n\t"
    1.95 +                      "vshr.u16   q10, q12, #6+5              \n\t"
    1.96 +                      "vmovn.u16  d5, q11                     \n\t"
    1.97 +                      "vmovn.u16  d6, q10                     \n\t"
    1.98 +                      "vshl.u8    d4, d4, #3                  \n\t"
    1.99 +                      "vshl.u8    d5, d5, #2                  \n\t"
   1.100 +                      "vshl.u8    d6, d6, #3                  \n\t"
   1.101 +
   1.102 +                      "vmovl.u8   q14, d31                    \n\t"
   1.103 +                      "vmovl.u8   q13, d31                    \n\t"
   1.104 +                      "vmovl.u8   q12, d31                    \n\t"
   1.105 +
    1.106 +                      // this block is duplicated in the 4/2/1- and 8-pixel versions
   1.107 +                      "vmvn.8     d30, d3                     \n\t"
   1.108 +                      "vmlal.u8   q14, d30, d6                \n\t"
   1.109 +                      "vmlal.u8   q13, d30, d5                \n\t"
   1.110 +                      "vmlal.u8   q12, d30, d4                \n\t"
   1.111 +                      "vshr.u16   q8, q14, #5                 \n\t"
   1.112 +                      "vshr.u16   q9, q13, #6                 \n\t"
   1.113 +                      "vaddhn.u16 d6, q14, q8                 \n\t"
   1.114 +                      "vshr.u16   q8, q12, #5                 \n\t"
   1.115 +                      "vaddhn.u16 d5, q13, q9                 \n\t"
   1.116 +                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
   1.117 +                      "vaddhn.u16 d4, q12, q8                 \n\t"
   1.118 +                      // intentionally don't calculate alpha
   1.119 +                      // result in d4-d6
   1.120 +
   1.121 +                      "vqadd.u8   d5, d5, d1                  \n\t"
   1.122 +                      "vqadd.u8   d4, d4, d2                  \n\t"
   1.123 +
   1.124 +                      // pack 8888 {d4-d6} to 0565 q10
   1.125 +                      "vshll.u8   q10, d6, #8                 \n\t"
   1.126 +                      "vshll.u8   q3, d5, #8                  \n\t"
   1.127 +                      "vshll.u8   q2, d4, #8                  \n\t"
   1.128 +                      "vsri.u16   q10, q3, #5                 \n\t"
   1.129 +                      "vsri.u16   q10, q2, #11                \n\t"
   1.130 +
   1.131 +                      "bne        2b                          \n\t"
   1.132 +
   1.133 +                      "1:                                         \n\t"
   1.134 +                      "vst1.16      {q10}, [%[keep_dst]]      \n\t"
   1.135 +                      : [count] "+r" (count)
   1.136 +                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
   1.137 +                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
   1.138 +                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
   1.139 +                      "d30","d31"
   1.140 +                      );
   1.141 +    }
   1.142 +    else
   1.143 +    {   // handle count < 8
   1.144 +        uint16_t* SK_RESTRICT keep_dst = 0;
   1.145 +
   1.146 +        asm volatile (
   1.147 +                      "vmov.u8    d31, #1<<7                  \n\t"
   1.148 +                      "mov        %[keep_dst], %[dst]         \n\t"
   1.149 +
   1.150 +                      "tst        %[count], #4                \n\t"
   1.151 +                      "beq        14f                         \n\t"
   1.152 +                      "vld1.16    {d25}, [%[dst]]!            \n\t"
   1.153 +                      "vld1.32    {q1}, [%[src]]!             \n\t"
   1.154 +
   1.155 +                      "14:                                        \n\t"
   1.156 +                      "tst        %[count], #2                \n\t"
   1.157 +                      "beq        12f                         \n\t"
   1.158 +                      "vld1.32    {d24[1]}, [%[dst]]!         \n\t"
   1.159 +                      "vld1.32    {d1}, [%[src]]!             \n\t"
   1.160 +
   1.161 +                      "12:                                        \n\t"
   1.162 +                      "tst        %[count], #1                \n\t"
   1.163 +                      "beq        11f                         \n\t"
   1.164 +                      "vld1.16    {d24[1]}, [%[dst]]!         \n\t"
   1.165 +                      "vld1.32    {d0[1]}, [%[src]]!          \n\t"
   1.166 +
   1.167 +                      "11:                                        \n\t"
   1.168 +                      // unzips achieve the same as a vld4 operation
   1.169 +                      "vuzpq.u16  q0, q1                      \n\t"
   1.170 +                      "vuzp.u8    d0, d1                      \n\t"
   1.171 +                      "vuzp.u8    d2, d3                      \n\t"
   1.172 +                      // expand 0565 q12 to 8888 {d4-d7}
   1.173 +                      "vmovn.u16  d4, q12                     \n\t"
   1.174 +                      "vshr.u16   q11, q12, #5                \n\t"
   1.175 +                      "vshr.u16   q10, q12, #6+5              \n\t"
   1.176 +                      "vmovn.u16  d5, q11                     \n\t"
   1.177 +                      "vmovn.u16  d6, q10                     \n\t"
   1.178 +                      "vshl.u8    d4, d4, #3                  \n\t"
   1.179 +                      "vshl.u8    d5, d5, #2                  \n\t"
   1.180 +                      "vshl.u8    d6, d6, #3                  \n\t"
   1.181 +
   1.182 +                      "vmovl.u8   q14, d31                    \n\t"
   1.183 +                      "vmovl.u8   q13, d31                    \n\t"
   1.184 +                      "vmovl.u8   q12, d31                    \n\t"
   1.185 +
    1.186 +                      // this block is duplicated in the 4/2/1- and 8-pixel versions
   1.187 +                      "vmvn.8     d30, d3                     \n\t"
   1.188 +                      "vmlal.u8   q14, d30, d6                \n\t"
   1.189 +                      "vmlal.u8   q13, d30, d5                \n\t"
   1.190 +                      "vmlal.u8   q12, d30, d4                \n\t"
   1.191 +                      "vshr.u16   q8, q14, #5                 \n\t"
   1.192 +                      "vshr.u16   q9, q13, #6                 \n\t"
   1.193 +                      "vaddhn.u16 d6, q14, q8                 \n\t"
   1.194 +                      "vshr.u16   q8, q12, #5                 \n\t"
   1.195 +                      "vaddhn.u16 d5, q13, q9                 \n\t"
   1.196 +                      "vqadd.u8   d6, d6, d0                  \n\t"  // moved up
   1.197 +                      "vaddhn.u16 d4, q12, q8                 \n\t"
   1.198 +                      // intentionally don't calculate alpha
   1.199 +                      // result in d4-d6
   1.200 +
   1.201 +                      "vqadd.u8   d5, d5, d1                  \n\t"
   1.202 +                      "vqadd.u8   d4, d4, d2                  \n\t"
   1.203 +
   1.204 +                      // pack 8888 {d4-d6} to 0565 q10
   1.205 +                      "vshll.u8   q10, d6, #8                 \n\t"
   1.206 +                      "vshll.u8   q3, d5, #8                  \n\t"
   1.207 +                      "vshll.u8   q2, d4, #8                  \n\t"
   1.208 +                      "vsri.u16   q10, q3, #5                 \n\t"
   1.209 +                      "vsri.u16   q10, q2, #11                \n\t"
   1.210 +
   1.211 +                      // store
   1.212 +                      "tst        %[count], #4                \n\t"
   1.213 +                      "beq        24f                         \n\t"
   1.214 +                      "vst1.16    {d21}, [%[keep_dst]]!       \n\t"
   1.215 +
   1.216 +                      "24:                                        \n\t"
   1.217 +                      "tst        %[count], #2                \n\t"
   1.218 +                      "beq        22f                         \n\t"
   1.219 +                      "vst1.32    {d20[1]}, [%[keep_dst]]!    \n\t"
   1.220 +
   1.221 +                      "22:                                        \n\t"
   1.222 +                      "tst        %[count], #1                \n\t"
   1.223 +                      "beq        21f                         \n\t"
   1.224 +                      "vst1.16    {d20[1]}, [%[keep_dst]]!    \n\t"
   1.225 +
   1.226 +                      "21:                                        \n\t"
   1.227 +                      : [count] "+r" (count)
   1.228 +                      : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src)
   1.229 +                      : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7",
   1.230 +                      "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29",
   1.231 +                      "d30","d31"
   1.232 +                      );
   1.233 +    }
   1.234 +}
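Note: the "expand 0565 q12 to 8888" step in both asm blocks isolates each 565 field and shifts it to the top of an 8-bit lane; the low bits are refilled by the rounding blend that follows. A scalar sketch of the expansion (hypothetical helper):

    static inline void expand_565(uint16_t p, uint8_t* r, uint8_t* g, uint8_t* b) {
        *r = (uint8_t)(((p >> 11) & 0x1f) << 3);  // vshr #6+5, vmovn, vshl #3
        *g = (uint8_t)(((p >>  5) & 0x3f) << 2);  // vshr #5,   vmovn, vshl #2
        *b = (uint8_t)(( p        & 0x1f) << 3);  // vmovn,            vshl #3
    }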
   1.235 +
   1.236 +static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) {
   1.237 +    prod += vdupq_n_u16(128);
   1.238 +    prod += vshrq_n_u16(prod, 8);
   1.239 +    return vshrq_n_u16(prod, 8);
   1.240 +}
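Note: this is the standard exact rounded division by 255: for x in [0, 255*255], (x + 128 + ((x + 128) >> 8)) >> 8 equals x/255 rounded to nearest. A scalar equivalent (hypothetical helper):

    static inline uint16_t div255_round(uint16_t x) {
        x += 128;
        return (uint16_t)((x + (x >> 8)) >> 8);
    }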
   1.241 +
   1.242 +void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst,
   1.243 +                          const SkPMColor* SK_RESTRICT src, int count,
   1.244 +                          U8CPU alpha, int /*x*/, int /*y*/) {
   1.245 +   SkASSERT(255 > alpha);
   1.246 +
   1.247 +    /* This code implements a Neon version of S32A_D565_Blend. The results have
   1.248 +     * a few mismatches compared to the original code. These mismatches never
   1.249 +     * exceed 1.
   1.250 +     */
   1.251 +
   1.252 +    if (count >= 8) {
   1.253 +        uint16x8_t valpha_max, vmask_blue;
   1.254 +        uint8x8_t valpha;
   1.255 +
   1.256 +        // prepare constants
   1.257 +        valpha_max = vmovq_n_u16(255);
   1.258 +        valpha = vdup_n_u8(alpha);
   1.259 +        vmask_blue = vmovq_n_u16(SK_B16_MASK);
   1.260 +
   1.261 +        do {
   1.262 +            uint16x8_t vdst, vdst_r, vdst_g, vdst_b;
   1.263 +            uint16x8_t vres_a, vres_r, vres_g, vres_b;
   1.264 +            uint8x8x4_t vsrc;
   1.265 +
   1.266 +            // load pixels
   1.267 +            vdst = vld1q_u16(dst);
   1.268 +#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
   1.269 +            asm (
   1.270 +                "vld4.u8 %h[vsrc], [%[src]]!"
   1.271 +                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
   1.272 +                : :
   1.273 +            );
   1.274 +#else
   1.275 +            register uint8x8_t d0 asm("d0");
   1.276 +            register uint8x8_t d1 asm("d1");
   1.277 +            register uint8x8_t d2 asm("d2");
   1.278 +            register uint8x8_t d3 asm("d3");
   1.279 +
   1.280 +            asm volatile (
   1.281 +                "vld4.u8    {d0-d3},[%[src]]!;"
   1.282 +                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
   1.283 +                  [src] "+&r" (src)
   1.284 +                : :
   1.285 +            );
   1.286 +            vsrc.val[0] = d0;
   1.287 +            vsrc.val[1] = d1;
   1.288 +            vsrc.val[2] = d2;
   1.289 +            vsrc.val[3] = d3;
   1.290 +#endif
   1.291 +
   1.292 +
   1.293 +            // deinterleave dst
   1.294 +            vdst_g = vshlq_n_u16(vdst, SK_R16_BITS);        // shift green to top of lanes
   1.295 +            vdst_b = vdst & vmask_blue;                     // extract blue
   1.296 +            vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT);       // extract red
   1.297 +            vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green
   1.298 +
   1.299 +            // shift src to 565
   1.300 +            vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS);
   1.301 +            vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS);
   1.302 +            vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS);
   1.303 +
   1.304 +            // calc src * src_scale
   1.305 +            vres_a = vmull_u8(vsrc.val[NEON_A], valpha);
   1.306 +            vres_r = vmull_u8(vsrc.val[NEON_R], valpha);
   1.307 +            vres_g = vmull_u8(vsrc.val[NEON_G], valpha);
   1.308 +            vres_b = vmull_u8(vsrc.val[NEON_B], valpha);
   1.309 +
   1.310 +            // prepare dst_scale
   1.311 +            vres_a = SkDiv255Round_neon8(vres_a);
   1.312 +            vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255
   1.313 +
   1.314 +            // add dst * dst_scale to previous result
   1.315 +            vres_r = vmlaq_u16(vres_r, vdst_r, vres_a);
   1.316 +            vres_g = vmlaq_u16(vres_g, vdst_g, vres_a);
   1.317 +            vres_b = vmlaq_u16(vres_b, vdst_b, vres_a);
   1.318 +
   1.319 +#ifdef S32A_D565_BLEND_EXACT
   1.320 +            // It is possible to get exact results with this but it is slow,
   1.321 +            // even slower than C code in some cases
   1.322 +            vres_r = SkDiv255Round_neon8(vres_r);
   1.323 +            vres_g = SkDiv255Round_neon8(vres_g);
   1.324 +            vres_b = SkDiv255Round_neon8(vres_b);
   1.325 +#else
   1.326 +            vres_r = vrshrq_n_u16(vres_r, 8);
   1.327 +            vres_g = vrshrq_n_u16(vres_g, 8);
   1.328 +            vres_b = vrshrq_n_u16(vres_b, 8);
   1.329 +#endif
   1.330 +            // pack result
   1.331 +            vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue
   1.332 +            vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue
   1.333 +
   1.334 +            // store
   1.335 +            vst1q_u16(dst, vres_b);
   1.336 +            dst += 8;
   1.337 +            count -= 8;
   1.338 +        } while (count >= 8);
   1.339 +    }
   1.340 +
   1.341 +    // leftovers
   1.342 +    while (count-- > 0) {
   1.343 +        SkPMColor sc = *src++;
   1.344 +        if (sc) {
   1.345 +            uint16_t dc = *dst;
   1.346 +            unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha);
   1.347 +            unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale);
   1.348 +            unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale);
   1.349 +            unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale);
   1.350 +            *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db));
   1.351 +        }
   1.352 +        dst += 1;
   1.353 +    }
   1.354 +}
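Note on the default (non-exact) path above: vrshrq_n_u16(x, 8) computes (x + 128) >> 8, a rounded divide by 256 rather than 255, which is where the "mismatches never exceed 1" caveat at the top of the function comes from. Scalar view (hypothetical helper):

    static inline uint16_t approx_div255(uint16_t x) {
        return (uint16_t)((x + 128) >> 8);  // at most 1 off from x/255 rounded
    }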
   1.355 +
   1.356 +/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16.
   1.357 + * each dither value is spaced out into byte lanes, and repeated
   1.358 + * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the
   1.359 + * start of each row.
   1.360 + */
   1.361 +static const uint8_t gDitherMatrix_Neon[48] = {
   1.362 +    0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5,
   1.363 +    6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3,
   1.364 +    1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4,
   1.365 +    7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2,
   1.366 +
   1.367 +};
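Note: the blitters below index the matrix as in this usage sketch (intrinsics from the arm_neon.h already included above); each row is 12 bytes, so an 8-byte load starting at offset (x & 3) always stays within the row:

    static inline uint8x8_t load_dither_row(int x, int y) {
        // (y & 3) selects the row, (x & 3) the phase within it
        return vld1_u8(&gDitherMatrix_Neon[(y & 3) * 12 + (x & 3)]);
    }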
   1.368 +
   1.369 +void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src,
   1.370 +                                int count, U8CPU alpha, int x, int y)
   1.371 +{
   1.372 +
   1.373 +    SkASSERT(255 > alpha);
   1.374 +
   1.375 +    // rescale alpha to range 1 - 256
   1.376 +    int scale = SkAlpha255To256(alpha);
   1.377 +
   1.378 +    if (count >= 8) {
   1.379 +        /* select row and offset for dither array */
   1.380 +        const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
   1.381 +
   1.382 +        uint8x8_t vdither = vld1_u8(dstart);         // load dither values
   1.383 +        uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values
   1.384 +
   1.385 +        int16x8_t vscale = vdupq_n_s16(scale);        // duplicate scale into neon reg
   1.386 +        uint16x8_t vmask_b = vdupq_n_u16(0x1F);         // set up blue mask
   1.387 +
   1.388 +        do {
   1.389 +
   1.390 +            uint8x8_t vsrc_r, vsrc_g, vsrc_b;
   1.391 +            uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b;
   1.392 +            uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b;
   1.393 +            uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b;
   1.394 +            uint16x8_t vdst;
   1.395 +            uint16x8_t vdst_r, vdst_g, vdst_b;
   1.396 +            int16x8_t vres_r, vres_g, vres_b;
   1.397 +            int8x8_t vres8_r, vres8_g, vres8_b;
   1.398 +
   1.399 +            // Load source and add dither
   1.400 +            {
   1.401 +            register uint8x8_t d0 asm("d0");
   1.402 +            register uint8x8_t d1 asm("d1");
   1.403 +            register uint8x8_t d2 asm("d2");
   1.404 +            register uint8x8_t d3 asm("d3");
   1.405 +
   1.406 +            asm (
   1.407 +                "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
   1.408 +                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
   1.409 +                :
   1.410 +            );
   1.411 +            vsrc_g = d1;
   1.412 +#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
   1.413 +            vsrc_r = d2; vsrc_b = d0;
   1.414 +#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
   1.415 +            vsrc_r = d0; vsrc_b = d2;
   1.416 +#endif
   1.417 +            }
   1.418 +
   1.419 +            vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6
   1.420 +            vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5
   1.421 +            vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. blue >> 5
   1.422 +
   1.423 +            vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen
   1.424 +            vsrc_dit_r = vaddl_u8(vsrc_r, vdither);   // add in dither to red and widen
   1.425 +            vsrc_dit_b = vaddl_u8(vsrc_b, vdither);   // add in dither to blue and widen
   1.426 +
   1.427 +            vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r);  // sub shifted red from result
   1.428 +            vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g);  // sub shifted green from result
   1.429 +            vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b);  // sub shifted blue from result
   1.430 +
   1.431 +            vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3);
   1.432 +            vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2);
   1.433 +            vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3);
   1.434 +
   1.435 +            // Load dst and unpack
   1.436 +            vdst = vld1q_u16(dst);
   1.437 +            vdst_g = vshrq_n_u16(vdst, 5);                   // shift down to get green
   1.438 +            vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red
   1.439 +            vdst_b = vandq_u16(vdst, vmask_b);               // mask to get blue
   1.440 +
   1.441 +            // subtract dst from src and widen
   1.442 +            vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r));
   1.443 +            vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g));
   1.444 +            vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b));
   1.445 +
   1.446 +            // multiply diffs by scale and shift
   1.447 +            vres_r = vmulq_s16(vres_r, vscale);
   1.448 +            vres_g = vmulq_s16(vres_g, vscale);
   1.449 +            vres_b = vmulq_s16(vres_b, vscale);
   1.450 +
   1.451 +            vres8_r = vshrn_n_s16(vres_r, 8);
   1.452 +            vres8_g = vshrn_n_s16(vres_g, 8);
   1.453 +            vres8_b = vshrn_n_s16(vres_b, 8);
   1.454 +
   1.455 +            // add dst to result
   1.456 +            vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r);
   1.457 +            vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g);
   1.458 +            vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b);
   1.459 +
   1.460 +            // put result into 565 format
   1.461 +            vres_b = vsliq_n_s16(vres_b, vres_g, 5);   // shift up green and insert into blue
   1.462 +            vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue
   1.463 +
   1.464 +            // Store result
   1.465 +            vst1q_u16(dst, vreinterpretq_u16_s16(vres_b));
   1.466 +
   1.467 +            // Next iteration
   1.468 +            dst += 8;
   1.469 +            count -= 8;
   1.470 +
   1.471 +        } while (count >= 8);
   1.472 +    }
   1.473 +
   1.474 +    // Leftovers
   1.475 +    if (count > 0) {
   1.476 +        int scale = SkAlpha255To256(alpha);
   1.477 +        DITHER_565_SCAN(y);
   1.478 +        do {
   1.479 +            SkPMColor c = *src++;
   1.480 +            SkPMColorAssert(c);
   1.481 +
   1.482 +            int dither = DITHER_VALUE(x);
   1.483 +            int sr = SkGetPackedR32(c);
   1.484 +            int sg = SkGetPackedG32(c);
   1.485 +            int sb = SkGetPackedB32(c);
   1.486 +            sr = SkDITHER_R32To565(sr, dither);
   1.487 +            sg = SkDITHER_G32To565(sg, dither);
   1.488 +            sb = SkDITHER_B32To565(sb, dither);
   1.489 +
   1.490 +            uint16_t d = *dst;
   1.491 +            *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale),
   1.492 +                                 SkAlphaBlend(sg, SkGetPackedG16(d), scale),
   1.493 +                                 SkAlphaBlend(sb, SkGetPackedB16(d), scale));
   1.494 +            DITHER_INC_X(x);
   1.495 +        } while (--count != 0);
   1.496 +    }
   1.497 +}
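Note: the vector loop and the SkAlphaBlend leftovers compute the same per-channel interpolation. A scalar sketch (hypothetical helper):

    static inline int blend_565_channel(int src, int dst, int scale256) {
        // the vsub/vmul/vshrn/vaddw sequence above
        return dst + (((src - dst) * scale256) >> 8);
    }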
   1.498 +
   1.499 +void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
   1.500 +                                const SkPMColor* SK_RESTRICT src,
   1.501 +                                int count, U8CPU alpha) {
   1.502 +
   1.503 +    SkASSERT(255 == alpha);
   1.504 +    if (count > 0) {
   1.505 +
   1.506 +
   1.507 +    uint8x8_t alpha_mask;
   1.508 +
   1.509 +    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
   1.510 +    alpha_mask = vld1_u8(alpha_mask_setup);
   1.511 +
   1.512 +    /* do the NEON unrolled code */
   1.513 +#define    UNROLL    4
   1.514 +    while (count >= UNROLL) {
   1.515 +        uint8x8_t src_raw, dst_raw, dst_final;
   1.516 +        uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
   1.517 +
    1.518 +        /* The two prefetches below may make the code slightly
   1.519 +         * slower for small values of count but are worth having
   1.520 +         * in the general case.
   1.521 +         */
   1.522 +        __builtin_prefetch(src+32);
   1.523 +        __builtin_prefetch(dst+32);
   1.524 +
   1.525 +        /* get the source */
   1.526 +        src_raw = vreinterpret_u8_u32(vld1_u32(src));
   1.527 +#if    UNROLL > 2
   1.528 +        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
   1.529 +#endif
   1.530 +
   1.531 +        /* get and hold the dst too */
   1.532 +        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
   1.533 +#if    UNROLL > 2
   1.534 +        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
   1.535 +#endif
   1.536 +
   1.537 +    /* 1st and 2nd bits of the unrolling */
   1.538 +    {
   1.539 +        uint8x8_t dst_cooked;
   1.540 +        uint16x8_t dst_wide;
   1.541 +        uint8x8_t alpha_narrow;
   1.542 +        uint16x8_t alpha_wide;
   1.543 +
   1.544 +        /* get the alphas spread out properly */
   1.545 +        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
   1.546 +        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
   1.547 +
   1.548 +        /* spread the dest */
   1.549 +        dst_wide = vmovl_u8(dst_raw);
   1.550 +
   1.551 +        /* alpha mul the dest */
   1.552 +        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
   1.553 +        dst_cooked = vshrn_n_u16(dst_wide, 8);
   1.554 +
   1.555 +        /* sum -- ignoring any byte lane overflows */
   1.556 +        dst_final = vadd_u8(src_raw, dst_cooked);
   1.557 +    }
   1.558 +
   1.559 +#if    UNROLL > 2
   1.560 +    /* the 3rd and 4th bits of our unrolling */
   1.561 +    {
   1.562 +        uint8x8_t dst_cooked;
   1.563 +        uint16x8_t dst_wide;
   1.564 +        uint8x8_t alpha_narrow;
   1.565 +        uint16x8_t alpha_wide;
   1.566 +
   1.567 +        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
   1.568 +        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
   1.569 +
   1.570 +        /* spread the dest */
   1.571 +        dst_wide = vmovl_u8(dst_raw_2);
   1.572 +
   1.573 +        /* alpha mul the dest */
   1.574 +        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
   1.575 +        dst_cooked = vshrn_n_u16(dst_wide, 8);
   1.576 +
   1.577 +        /* sum -- ignoring any byte lane overflows */
   1.578 +        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
   1.579 +    }
   1.580 +#endif
   1.581 +
   1.582 +        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
   1.583 +#if    UNROLL > 2
   1.584 +        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
   1.585 +#endif
   1.586 +
   1.587 +        src += UNROLL;
   1.588 +        dst += UNROLL;
   1.589 +        count -= UNROLL;
   1.590 +    }
   1.591 +#undef    UNROLL
   1.592 +
   1.593 +    /* do any residual iterations */
   1.594 +        while (--count >= 0) {
   1.595 +            *dst = SkPMSrcOver(*src, *dst);
   1.596 +            src += 1;
   1.597 +            dst += 1;
   1.598 +        }
   1.599 +    }
   1.600 +}
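Note: each unrolled "bit" above is the vector form of premultiplied src-over, dst = src + dst * (256 - sa) >> 8 per byte lane; the vtbl1_u8 with {3,3,3,3,7,7,7,7} broadcasts each pixel's alpha byte across its four lanes. A scalar sketch, assuming alpha in the top byte (hypothetical helper):

    static inline uint32_t srcover_scalar(uint32_t src, uint32_t dst) {
        uint32_t scale = 256 - (src >> 24);               // vsubw_u8(vdupq_n_u16(256), a)
        uint32_t rb = ((dst & 0x00FF00FF) * scale) >> 8;  // even byte lanes
        uint32_t ag = ((dst >> 8) & 0x00FF00FF) * scale;  // odd byte lanes
        return src + ((rb & 0x00FF00FF) | (ag & 0xFF00FF00));
    }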
   1.601 +
   1.602 +void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst,
   1.603 +                                const SkPMColor* SK_RESTRICT src,
   1.604 +                                int count, U8CPU alpha) {
   1.605 +    SkASSERT(255 == alpha);
   1.606 +
    1.607 +    if (count <= 0)
    1.608 +        return;
   1.609 +
   1.610 +    /* Use these to check if src is transparent or opaque */
   1.611 +    const unsigned int ALPHA_OPAQ  = 0xFF000000;
   1.612 +    const unsigned int ALPHA_TRANS = 0x00FFFFFF;
   1.613 +
   1.614 +#define UNROLL  4
   1.615 +    const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1);
   1.616 +    const SkPMColor* SK_RESTRICT src_temp = src;
   1.617 +
   1.618 +    /* set up the NEON variables */
   1.619 +    uint8x8_t alpha_mask;
   1.620 +    static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
   1.621 +    alpha_mask = vld1_u8(alpha_mask_setup);
   1.622 +
   1.623 +    uint8x8_t src_raw, dst_raw, dst_final;
   1.624 +    uint8x8_t src_raw_2, dst_raw_2, dst_final_2;
   1.625 +    uint8x8_t dst_cooked;
   1.626 +    uint16x8_t dst_wide;
   1.627 +    uint8x8_t alpha_narrow;
   1.628 +    uint16x8_t alpha_wide;
   1.629 +
   1.630 +    /* choose the first processing type */
   1.631 +    if( src >= src_end)
   1.632 +        goto TAIL;
   1.633 +    if(*src <= ALPHA_TRANS)
   1.634 +        goto ALPHA_0;
   1.635 +    if(*src >= ALPHA_OPAQ)
   1.636 +        goto ALPHA_255;
   1.637 +    /* fall-thru */
   1.638 +
   1.639 +ALPHA_1_TO_254:
   1.640 +    do {
   1.641 +
   1.642 +        /* get the source */
   1.643 +        src_raw = vreinterpret_u8_u32(vld1_u32(src));
   1.644 +        src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2));
   1.645 +
   1.646 +        /* get and hold the dst too */
   1.647 +        dst_raw = vreinterpret_u8_u32(vld1_u32(dst));
   1.648 +        dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2));
   1.649 +
   1.650 +
   1.651 +        /* get the alphas spread out properly */
   1.652 +        alpha_narrow = vtbl1_u8(src_raw, alpha_mask);
   1.653 +        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
   1.654 +        /* we collapsed (255-a)+1 ... */
   1.655 +        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
   1.656 +
   1.657 +        /* spread the dest */
   1.658 +        dst_wide = vmovl_u8(dst_raw);
   1.659 +
   1.660 +        /* alpha mul the dest */
   1.661 +        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
   1.662 +        dst_cooked = vshrn_n_u16(dst_wide, 8);
   1.663 +
   1.664 +        /* sum -- ignoring any byte lane overflows */
   1.665 +        dst_final = vadd_u8(src_raw, dst_cooked);
   1.666 +
   1.667 +        alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask);
   1.668 +        /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */
   1.669 +        /* we collapsed (255-a)+1 ... */
   1.670 +        alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow);
   1.671 +
   1.672 +        /* spread the dest */
   1.673 +        dst_wide = vmovl_u8(dst_raw_2);
   1.674 +
   1.675 +        /* alpha mul the dest */
   1.676 +        dst_wide = vmulq_u16 (dst_wide, alpha_wide);
   1.677 +        dst_cooked = vshrn_n_u16(dst_wide, 8);
   1.678 +
   1.679 +        /* sum -- ignoring any byte lane overflows */
   1.680 +        dst_final_2 = vadd_u8(src_raw_2, dst_cooked);
   1.681 +
   1.682 +        vst1_u32(dst, vreinterpret_u32_u8(dst_final));
   1.683 +        vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2));
   1.684 +
   1.685 +        src += UNROLL;
   1.686 +        dst += UNROLL;
   1.687 +
   1.688 +        /* if 2 of the next pixels aren't between 1 and 254
   1.689 +        it might make sense to go to the optimized loops */
   1.690 +        if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ))
   1.691 +            break;
   1.692 +
   1.693 +    } while(src < src_end);
   1.694 +
   1.695 +    if (src >= src_end)
   1.696 +        goto TAIL;
   1.697 +
   1.698 +    if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)
   1.699 +        goto ALPHA_255;
   1.700 +
   1.701 +    /*fall-thru*/
   1.702 +
   1.703 +ALPHA_0:
   1.704 +
   1.705 +    /*In this state, we know the current alpha is 0 and
   1.706 +     we optimize for the next alpha also being zero. */
   1.707 +    src_temp = src;  //so we don't have to increment dst every time
   1.708 +    do {
   1.709 +        if(*(++src) > ALPHA_TRANS)
   1.710 +            break;
   1.711 +        if(*(++src) > ALPHA_TRANS)
   1.712 +            break;
   1.713 +        if(*(++src) > ALPHA_TRANS)
   1.714 +            break;
   1.715 +        if(*(++src) > ALPHA_TRANS)
   1.716 +            break;
   1.717 +    } while(src < src_end);
   1.718 +
   1.719 +    dst += (src - src_temp);
   1.720 +
   1.721 +    /* no longer alpha 0, so determine where to go next. */
   1.722 +    if( src >= src_end)
   1.723 +        goto TAIL;
   1.724 +    if(*src >= ALPHA_OPAQ)
   1.725 +        goto ALPHA_255;
   1.726 +    else
   1.727 +        goto ALPHA_1_TO_254;
   1.728 +
   1.729 +ALPHA_255:
   1.730 +    while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) {
   1.731 +        dst[0]=src[0];
   1.732 +        dst[1]=src[1];
   1.733 +        dst[2]=src[2];
   1.734 +        dst[3]=src[3];
   1.735 +        src+=UNROLL;
   1.736 +        dst+=UNROLL;
   1.737 +        if(src >= src_end)
   1.738 +            goto TAIL;
   1.739 +    }
   1.740 +
   1.741 +    //Handle remainder.
   1.742 +    if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
   1.743 +        if(*src >= ALPHA_OPAQ) { *dst++ = *src++;
   1.744 +            if(*src >= ALPHA_OPAQ) { *dst++ = *src++; }
   1.745 +        }
   1.746 +    }
   1.747 +
   1.748 +    if( src >= src_end)
   1.749 +        goto TAIL;
   1.750 +    if(*src <= ALPHA_TRANS)
   1.751 +        goto ALPHA_0;
   1.752 +    else
   1.753 +        goto ALPHA_1_TO_254;
   1.754 +
   1.755 +TAIL:
   1.756 +    /* do any residual iterations */
    1.757 +    src_end += UNROLL + 1;  // go to the real end
   1.758 +    while(src != src_end) {
   1.759 +        if( *src != 0 ) {
   1.760 +            if( *src >= ALPHA_OPAQ ) {
   1.761 +                *dst = *src;
   1.762 +            }
   1.763 +            else {
   1.764 +                *dst = SkPMSrcOver(*src, *dst);
   1.765 +            }
   1.766 +        }
   1.767 +        src++;
   1.768 +        dst++;
   1.769 +    }
   1.770 +
   1.771 +#undef    UNROLL
   1.772 +    return;
   1.773 +}
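Note: the ALPHA_TRANS/ALPHA_OPAQ comparisons classify whole pixels without extracting the alpha byte, and the bitwise AND of four pixels is >= ALPHA_OPAQ exactly when all four alpha bytes are 0xFF. With alpha stored in the most significant byte, as the constants assume:

    static inline int is_transparent(uint32_t c) { return c <= 0x00FFFFFF; }  // alpha == 0
    static inline int is_opaque(uint32_t c)      { return c >= 0xFF000000; }  // alpha == 255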
   1.774 +
   1.775 +/* Neon version of S32_Blend_BlitRow32()
   1.776 + * portable version is in src/core/SkBlitRow_D32.cpp
   1.777 + */
   1.778 +void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
   1.779 +                              const SkPMColor* SK_RESTRICT src,
   1.780 +                              int count, U8CPU alpha) {
   1.781 +    SkASSERT(alpha <= 255);
   1.782 +
   1.783 +    if (count <= 0) {
   1.784 +        return;
   1.785 +    }
   1.786 +
   1.787 +    uint16_t src_scale = SkAlpha255To256(alpha);
   1.788 +    uint16_t dst_scale = 256 - src_scale;
   1.789 +
   1.790 +    while (count >= 2) {
   1.791 +        uint8x8_t vsrc, vdst, vres;
   1.792 +        uint16x8_t vsrc_wide, vdst_wide;
   1.793 +
    1.794 +        /* These commented-out prefetches are a big win for count
   1.795 +         * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4.
   1.796 +         * They also hurt a little (<5%) on an A15
   1.797 +         */
   1.798 +        //__builtin_prefetch(src+32);
   1.799 +        //__builtin_prefetch(dst+32);
   1.800 +
   1.801 +        // Load
   1.802 +        vsrc = vreinterpret_u8_u32(vld1_u32(src));
   1.803 +        vdst = vreinterpret_u8_u32(vld1_u32(dst));
   1.804 +
   1.805 +        // Process src
   1.806 +        vsrc_wide = vmovl_u8(vsrc);
   1.807 +        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
   1.808 +
   1.809 +        // Process dst
   1.810 +        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
   1.811 +
   1.812 +        // Combine
   1.813 +        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
   1.814 +
   1.815 +        // Store
   1.816 +        vst1_u32(dst, vreinterpret_u32_u8(vres));
   1.817 +
   1.818 +        src += 2;
   1.819 +        dst += 2;
   1.820 +        count -= 2;
   1.821 +    }
   1.822 +
   1.823 +    if (count == 1) {
   1.824 +        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
   1.825 +        uint16x8_t vsrc_wide, vdst_wide;
   1.826 +
   1.827 +        // Load
   1.828 +        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
   1.829 +        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
   1.830 +
   1.831 +        // Process
   1.832 +        vsrc_wide = vmovl_u8(vsrc);
   1.833 +        vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale));
   1.834 +        vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale));
   1.835 +        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
   1.836 +
   1.837 +        // Store
   1.838 +        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
   1.839 +    }
   1.840 +}
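Note: per byte lane this is a straight two-weight blend with src_scale + dst_scale == 256. A scalar sketch of what each lane computes (hypothetical helper):

    static inline uint8_t blend_channel(unsigned s, unsigned d,
                                        unsigned src_scale, unsigned dst_scale) {
        // vshrn(vsrc_wide, 8) + vshrn(vdst_wide, 8) above
        return (uint8_t)(((s * src_scale) >> 8) + ((d * dst_scale) >> 8));
    }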
   1.841 +
   1.842 +void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst,
   1.843 +                         const SkPMColor* SK_RESTRICT src,
   1.844 +                         int count, U8CPU alpha) {
   1.845 +
   1.846 +    SkASSERT(255 >= alpha);
   1.847 +
   1.848 +    if (count <= 0) {
   1.849 +        return;
   1.850 +    }
   1.851 +
   1.852 +    unsigned alpha256 = SkAlpha255To256(alpha);
   1.853 +
   1.854 +    // First deal with odd counts
   1.855 +    if (count & 1) {
   1.856 +        uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres;
   1.857 +        uint16x8_t vdst_wide, vsrc_wide;
   1.858 +        unsigned dst_scale;
   1.859 +
   1.860 +        // Load
   1.861 +        vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0));
   1.862 +        vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0));
   1.863 +
   1.864 +        // Calc dst_scale
   1.865 +        dst_scale = vget_lane_u8(vsrc, 3);
   1.866 +        dst_scale *= alpha256;
   1.867 +        dst_scale >>= 8;
   1.868 +        dst_scale = 256 - dst_scale;
   1.869 +
   1.870 +        // Process src
   1.871 +        vsrc_wide = vmovl_u8(vsrc);
   1.872 +        vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256);
   1.873 +
   1.874 +        // Process dst
   1.875 +        vdst_wide = vmovl_u8(vdst);
   1.876 +        vdst_wide = vmulq_n_u16(vdst_wide, dst_scale);
   1.877 +
   1.878 +        // Combine
   1.879 +        vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
   1.880 +
   1.881 +        vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0);
   1.882 +        dst++;
   1.883 +        src++;
   1.884 +        count--;
   1.885 +    }
   1.886 +
   1.887 +    if (count) {
   1.888 +        uint8x8_t alpha_mask;
   1.889 +        static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7};
   1.890 +        alpha_mask = vld1_u8(alpha_mask_setup);
   1.891 +
   1.892 +        do {
   1.893 +
   1.894 +            uint8x8_t vsrc, vdst, vres, vsrc_alphas;
   1.895 +            uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale;
   1.896 +
   1.897 +            __builtin_prefetch(src+32);
   1.898 +            __builtin_prefetch(dst+32);
   1.899 +
   1.900 +            // Load
   1.901 +            vsrc = vreinterpret_u8_u32(vld1_u32(src));
   1.902 +            vdst = vreinterpret_u8_u32(vld1_u32(dst));
   1.903 +
   1.904 +            // Prepare src_scale
   1.905 +            vsrc_scale = vdupq_n_u16(alpha256);
   1.906 +
   1.907 +            // Calc dst_scale
   1.908 +            vsrc_alphas = vtbl1_u8(vsrc, alpha_mask);
   1.909 +            vdst_scale = vmovl_u8(vsrc_alphas);
   1.910 +            vdst_scale *= vsrc_scale;
   1.911 +            vdst_scale = vshrq_n_u16(vdst_scale, 8);
   1.912 +            vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale);
   1.913 +
   1.914 +            // Process src
   1.915 +            vsrc_wide = vmovl_u8(vsrc);
   1.916 +            vsrc_wide *= vsrc_scale;
   1.917 +
   1.918 +            // Process dst
   1.919 +            vdst_wide = vmovl_u8(vdst);
   1.920 +            vdst_wide *= vdst_scale;
   1.921 +
   1.922 +            // Combine
   1.923 +            vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8);
   1.924 +
   1.925 +            vst1_u32(dst, vreinterpret_u32_u8(vres));
   1.926 +
   1.927 +            src += 2;
   1.928 +            dst += 2;
   1.929 +            count -= 2;
   1.930 +        } while(count);
   1.931 +    }
   1.932 +}
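Note: the odd-count path and the main loop derive the destination weight the same way, combining the global alpha with each pixel's own alpha multiplicatively. A scalar sketch (hypothetical helper):

    static inline unsigned dst_scale_for(unsigned src_alpha, unsigned alpha256) {
        // the vmovl/vmul/vshrq/vsubq sequence above
        return 256 - ((src_alpha * alpha256) >> 8);
    }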
   1.933 +
   1.934 +///////////////////////////////////////////////////////////////////////////////
   1.935 +
   1.936 +#undef    DEBUG_OPAQUE_DITHER
   1.937 +
   1.938 +#if    defined(DEBUG_OPAQUE_DITHER)
   1.939 +static void showme8(char *str, void *p, int len)
   1.940 +{
   1.941 +    static char buf[256];
   1.942 +    char tbuf[32];
   1.943 +    int i;
   1.944 +    char *pc = (char*) p;
   1.945 +    sprintf(buf,"%8s:", str);
   1.946 +    for(i=0;i<len;i++) {
   1.947 +        sprintf(tbuf, "   %02x", pc[i]);
   1.948 +        strcat(buf, tbuf);
   1.949 +    }
   1.950 +    SkDebugf("%s\n", buf);
   1.951 +}
   1.952 +static void showme16(char *str, void *p, int len)
   1.953 +{
   1.954 +    static char buf[256];
   1.955 +    char tbuf[32];
   1.956 +    int i;
   1.957 +    uint16_t *pc = (uint16_t*) p;
   1.958 +    sprintf(buf,"%8s:", str);
   1.959 +    len = (len / sizeof(uint16_t));    /* passed as bytes */
   1.960 +    for(i=0;i<len;i++) {
   1.961 +        sprintf(tbuf, " %04x", pc[i]);
   1.962 +        strcat(buf, tbuf);
   1.963 +    }
   1.964 +    SkDebugf("%s\n", buf);
   1.965 +}
   1.966 +#endif
   1.967 +
   1.968 +void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst,
   1.969 +                                   const SkPMColor* SK_RESTRICT src,
   1.970 +                                   int count, U8CPU alpha, int x, int y) {
   1.971 +    SkASSERT(255 == alpha);
   1.972 +
   1.973 +#define    UNROLL    8
   1.974 +
   1.975 +    if (count >= UNROLL) {
   1.976 +
   1.977 +#if defined(DEBUG_OPAQUE_DITHER)
   1.978 +    uint16_t tmpbuf[UNROLL];
   1.979 +    int td[UNROLL];
   1.980 +    int tdv[UNROLL];
   1.981 +    int ta[UNROLL];
   1.982 +    int tap[UNROLL];
   1.983 +    uint16_t in_dst[UNROLL];
   1.984 +    int offset = 0;
   1.985 +    int noisy = 0;
   1.986 +#endif
   1.987 +
   1.988 +    uint8x8_t dbase;
   1.989 +    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
   1.990 +    dbase = vld1_u8(dstart);
   1.991 +
   1.992 +        do {
   1.993 +        uint8x8_t sr, sg, sb, sa, d;
   1.994 +        uint16x8_t dst8, scale8, alpha8;
   1.995 +        uint16x8_t dst_r, dst_g, dst_b;
   1.996 +
   1.997 +#if defined(DEBUG_OPAQUE_DITHER)
   1.998 +        // calculate 8 elements worth into a temp buffer
   1.999 +        {
  1.1000 +        int my_y = y;
  1.1001 +        int my_x = x;
  1.1002 +        SkPMColor* my_src = (SkPMColor*)src;
  1.1003 +        uint16_t* my_dst = dst;
  1.1004 +        int i;
  1.1005 +
  1.1006 +        DITHER_565_SCAN(my_y);
  1.1007 +        for(i = 0; i < UNROLL; i++) {
  1.1008 +            SkPMColor c = *my_src++;
  1.1009 +            SkPMColorAssert(c);
  1.1010 +            if (c) {
  1.1011 +                unsigned a = SkGetPackedA32(c);
  1.1012 +
  1.1013 +                int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a));
  1.1014 +                tdv[i] = DITHER_VALUE(my_x);
  1.1015 +                ta[i] = a;
  1.1016 +                tap[i] = SkAlpha255To256(a);
  1.1017 +                td[i] = d;
  1.1018 +
  1.1019 +                unsigned sr = SkGetPackedR32(c);
  1.1020 +                unsigned sg = SkGetPackedG32(c);
  1.1021 +                unsigned sb = SkGetPackedB32(c);
  1.1022 +                sr = SkDITHER_R32_FOR_565(sr, d);
  1.1023 +                sg = SkDITHER_G32_FOR_565(sg, d);
  1.1024 +                sb = SkDITHER_B32_FOR_565(sb, d);
  1.1025 +
  1.1026 +                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
  1.1027 +                uint32_t dst_expanded = SkExpand_rgb_16(*my_dst);
  1.1028 +                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
  1.1029 +                // now src and dst expanded are in g:11 r:10 x:1 b:10
  1.1030 +                tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
  1.1031 +                td[i] = d;
  1.1032 +            } else {
  1.1033 +                tmpbuf[i] = *my_dst;
  1.1034 +                ta[i] = tdv[i] = td[i] = 0xbeef;
  1.1035 +            }
  1.1036 +            in_dst[i] = *my_dst;
  1.1037 +            my_dst += 1;
  1.1038 +            DITHER_INC_X(my_x);
  1.1039 +        }
  1.1040 +        }
  1.1041 +#endif
  1.1042 +
  1.1043 +
  1.1044 +        {
  1.1045 +        register uint8x8_t d0 asm("d0");
  1.1046 +        register uint8x8_t d1 asm("d1");
  1.1047 +        register uint8x8_t d2 asm("d2");
  1.1048 +        register uint8x8_t d3 asm("d3");
  1.1049 +
  1.1050 +        asm ("vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
  1.1051 +            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src)
  1.1052 +            :
  1.1053 +        );
  1.1054 +#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
  1.1055 +            sr = d2; sg = d1; sb = d0; sa = d3;
  1.1056 +#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
  1.1057 +            sr = d0; sg = d1; sb = d2; sa = d3;
  1.1058 +#endif
  1.1059 +        }
  1.1060 +
  1.1061 +        /* calculate 'd', which will be 0..7
  1.1062 +         * dbase[] is 0..7; alpha is 0..256; 16 bits suffice
  1.1063 +         */
  1.1064 +        alpha8 = vmovl_u8(dbase);
  1.1065 +        alpha8 = vmlal_u8(alpha8, sa, dbase);
  1.1066 +        d = vshrn_n_u16(alpha8, 8);    // narrowing too
  1.1067 +
  1.1068 +        // sr = sr - (sr>>5) + d
   1.1069 +        /* watch for 8-bit overflow: d is 0..7 and the risky range of
   1.1070 +         * sr is >248, where (sr>>5) is 7 and offsets 'd', so this is
   1.1071 +         * safe as long as we compute ((sr - (sr>>5)) + d)
   1.1072 +         */
  1.1073 +        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
  1.1074 +        sr = vadd_u8(sr, d);
  1.1075 +
  1.1076 +        // sb = sb - (sb>>5) + d
  1.1077 +        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
  1.1078 +        sb = vadd_u8(sb, d);
  1.1079 +
  1.1080 +        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
  1.1081 +        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
  1.1082 +        sg = vadd_u8(sg, vshr_n_u8(d,1));
  1.1083 +
  1.1084 +        // need to pick up 8 dst's -- at 16 bits each, 128 bits
  1.1085 +        dst8 = vld1q_u16(dst);
  1.1086 +        dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK));
  1.1087 +        dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS);
  1.1088 +        dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT);    // clearing hi bits
  1.1089 +
  1.1090 +        // blend
  1.1091 +        scale8 = vsubw_u8(vdupq_n_u16(256), sa);
  1.1092 +
  1.1093 +        // combine the addq and mul, save 3 insns
  1.1094 +        scale8 = vshrq_n_u16(scale8, 3);
  1.1095 +        dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8);
  1.1096 +        dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8);
  1.1097 +        dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8);
  1.1098 +
  1.1099 +        // repack to store
  1.1100 +        dst8 = vshrq_n_u16(dst_b, 5);
  1.1101 +        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5);
  1.1102 +        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11);
  1.1103 +
  1.1104 +        vst1q_u16(dst, dst8);
  1.1105 +
  1.1106 +#if defined(DEBUG_OPAQUE_DITHER)
  1.1107 +        // verify my 8 elements match the temp buffer
  1.1108 +        {
  1.1109 +        int i, bad=0;
  1.1110 +        static int invocation;
  1.1111 +
  1.1112 +        for (i = 0; i < UNROLL; i++) {
  1.1113 +            if (tmpbuf[i] != dst[i]) {
  1.1114 +                bad=1;
  1.1115 +            }
  1.1116 +        }
  1.1117 +        if (bad) {
  1.1118 +            SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n",
  1.1119 +                     invocation, offset);
  1.1120 +            SkDebugf("  alpha 0x%x\n", alpha);
  1.1121 +            for (i = 0; i < UNROLL; i++)
  1.1122 +                SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n",
  1.1123 +                         i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i],
  1.1124 +                         in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]);
  1.1125 +
  1.1126 +            showme16("alpha8", &alpha8, sizeof(alpha8));
  1.1127 +            showme16("scale8", &scale8, sizeof(scale8));
  1.1128 +            showme8("d", &d, sizeof(d));
  1.1129 +            showme16("dst8", &dst8, sizeof(dst8));
  1.1130 +            showme16("dst_b", &dst_b, sizeof(dst_b));
  1.1131 +            showme16("dst_g", &dst_g, sizeof(dst_g));
  1.1132 +            showme16("dst_r", &dst_r, sizeof(dst_r));
  1.1133 +            showme8("sb", &sb, sizeof(sb));
  1.1134 +            showme8("sg", &sg, sizeof(sg));
  1.1135 +            showme8("sr", &sr, sizeof(sr));
  1.1136 +
  1.1137 +            return;
  1.1138 +        }
  1.1139 +        offset += UNROLL;
  1.1140 +        invocation++;
  1.1141 +        }
  1.1142 +#endif
  1.1143 +        dst += UNROLL;
  1.1144 +        count -= UNROLL;
  1.1145 +        // skip x += UNROLL, since it's unchanged mod-4
  1.1146 +        } while (count >= UNROLL);
  1.1147 +    }
  1.1148 +#undef    UNROLL
  1.1149 +
  1.1150 +    // residuals
  1.1151 +    if (count > 0) {
  1.1152 +        DITHER_565_SCAN(y);
  1.1153 +        do {
  1.1154 +            SkPMColor c = *src++;
  1.1155 +            SkPMColorAssert(c);
  1.1156 +            if (c) {
  1.1157 +                unsigned a = SkGetPackedA32(c);
  1.1158 +
  1.1159 +                // dither and alpha are just temporary variables to work-around
  1.1160 +                // an ICE in debug.
  1.1161 +                unsigned dither = DITHER_VALUE(x);
  1.1162 +                unsigned alpha = SkAlpha255To256(a);
  1.1163 +                int d = SkAlphaMul(dither, alpha);
  1.1164 +
  1.1165 +                unsigned sr = SkGetPackedR32(c);
  1.1166 +                unsigned sg = SkGetPackedG32(c);
  1.1167 +                unsigned sb = SkGetPackedB32(c);
  1.1168 +                sr = SkDITHER_R32_FOR_565(sr, d);
  1.1169 +                sg = SkDITHER_G32_FOR_565(sg, d);
  1.1170 +                sb = SkDITHER_B32_FOR_565(sb, d);
  1.1171 +
  1.1172 +                uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2);
  1.1173 +                uint32_t dst_expanded = SkExpand_rgb_16(*dst);
  1.1174 +                dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3);
  1.1175 +                // now src and dst expanded are in g:11 r:10 x:1 b:10
  1.1176 +                *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5);
  1.1177 +            }
  1.1178 +            dst += 1;
  1.1179 +            DITHER_INC_X(x);
  1.1180 +        } while (--count != 0);
  1.1181 +    }
  1.1182 +}
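Note: two of the vector tricks above in scalar form (hypothetical helpers). The vmovl+vmlal pair computes dither * (alpha + 1) >> 8, matching SkAlphaMul(dither, SkAlpha255To256(a)) in the residual loop; and each vmlaq folds the expand, blend, and narrow of one 565 field into a single multiply-accumulate:

    static inline unsigned dither_mul(unsigned dither, unsigned a) {
        return (dither * (a + 1)) >> 8;    // (dbase + sa*dbase) >> 8
    }
    static inline uint16_t blend_565_field(unsigned s, unsigned dst_field,
                                           unsigned a, int upshift) {
        unsigned scale = (256u - a) >> 3;  // scale8 above
        // upshift is 2 for r/b (5-bit fields), 3 for g (6-bit field)
        return (uint16_t)(((s << upshift) + dst_field * scale) >> 5);
    }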
  1.1183 +
  1.1184 +///////////////////////////////////////////////////////////////////////////////
  1.1185 +
  1.1186 +#undef    DEBUG_S32_OPAQUE_DITHER
  1.1187 +
  1.1188 +void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst,
  1.1189 +                                 const SkPMColor* SK_RESTRICT src,
  1.1190 +                                 int count, U8CPU alpha, int x, int y) {
  1.1191 +    SkASSERT(255 == alpha);
  1.1192 +
  1.1193 +#define    UNROLL    8
  1.1194 +    if (count >= UNROLL) {
  1.1195 +    uint8x8_t d;
  1.1196 +    const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)];
  1.1197 +    d = vld1_u8(dstart);
  1.1198 +
  1.1199 +    while (count >= UNROLL) {
  1.1200 +        uint8x8_t sr, sg, sb;
  1.1201 +        uint16x8_t dr, dg, db;
  1.1202 +        uint16x8_t dst8;
  1.1203 +
  1.1204 +        {
  1.1205 +        register uint8x8_t d0 asm("d0");
  1.1206 +        register uint8x8_t d1 asm("d1");
  1.1207 +        register uint8x8_t d2 asm("d2");
  1.1208 +        register uint8x8_t d3 asm("d3");
  1.1209 +
  1.1210 +        asm (
  1.1211 +            "vld4.8    {d0-d3},[%[src]]!  /* r=%P0 g=%P1 b=%P2 a=%P3 */"
  1.1212 +            : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src)
  1.1213 +            :
  1.1214 +        );
  1.1215 +        sg = d1;
  1.1216 +#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A)
  1.1217 +        sr = d2; sb = d0;
  1.1218 +#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A)
  1.1219 +        sr = d0; sb = d2;
  1.1220 +#endif
  1.1221 +        }
   1.1222 +        /* XXX: if we want to prefetch, hide it in the above asm()
   1.1223 +         * using gcc's __builtin_prefetch(); the prefetch will then
   1.1224 +         * fall to the bottom of the loop instead of staying at the
   1.1225 +         * top of the loop, just after the vld4.
   1.1226 +         */
  1.1227 +
  1.1228 +        // sr = sr - (sr>>5) + d
  1.1229 +        sr = vsub_u8(sr, vshr_n_u8(sr, 5));
  1.1230 +        dr = vaddl_u8(sr, d);
  1.1231 +
  1.1232 +        // sb = sb - (sb>>5) + d
  1.1233 +        sb = vsub_u8(sb, vshr_n_u8(sb, 5));
  1.1234 +        db = vaddl_u8(sb, d);
  1.1235 +
  1.1236 +        // sg = sg - (sg>>6) + d>>1; similar logic for overflows
  1.1237 +        sg = vsub_u8(sg, vshr_n_u8(sg, 6));
  1.1238 +        dg = vaddl_u8(sg, vshr_n_u8(d, 1));
  1.1239 +
  1.1240 +        // pack high bits of each into 565 format  (rgb, b is lsb)
  1.1241 +        dst8 = vshrq_n_u16(db, 3);
  1.1242 +        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5);
  1.1243 +        dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11);
  1.1244 +
  1.1245 +        // store it
  1.1246 +        vst1q_u16(dst, dst8);
  1.1247 +
  1.1248 +#if    defined(DEBUG_S32_OPAQUE_DITHER)
  1.1249 +        // always good to know if we generated good results
  1.1250 +        {
  1.1251 +        int i, myx = x, myy = y;
  1.1252 +        DITHER_565_SCAN(myy);
  1.1253 +        for (i=0;i<UNROLL;i++) {
  1.1254 +            // the '!' in the asm block above post-incremented src by the 8 pixels it reads.
  1.1255 +            SkPMColor c = src[i-8];
  1.1256 +            unsigned dither = DITHER_VALUE(myx);
  1.1257 +            uint16_t val = SkDitherRGB32To565(c, dither);
  1.1258 +            if (val != dst[i]) {
   1.1259 +            SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dstart[i] %02x\n",
   1.1260 +                c, dither, val, dst[i], dstart[i]);
  1.1261 +            }
  1.1262 +            DITHER_INC_X(myx);
  1.1263 +        }
  1.1264 +        }
  1.1265 +#endif
  1.1266 +
  1.1267 +        dst += UNROLL;
  1.1268 +        // we don't need to increment src as the asm above has already done it
  1.1269 +        count -= UNROLL;
   1.1270 +        x += UNROLL;        // (x & 3) is unchanged since UNROLL is a multiple of 4
  1.1271 +    }
  1.1272 +    }
  1.1273 +#undef    UNROLL
  1.1274 +
  1.1275 +    // residuals
  1.1276 +    if (count > 0) {
  1.1277 +        DITHER_565_SCAN(y);
  1.1278 +        do {
  1.1279 +            SkPMColor c = *src++;
  1.1280 +            SkPMColorAssert(c);
  1.1281 +            SkASSERT(SkGetPackedA32(c) == 255);
  1.1282 +
  1.1283 +            unsigned dither = DITHER_VALUE(x);
  1.1284 +            *dst++ = SkDitherRGB32To565(c, dither);
  1.1285 +            DITHER_INC_X(x);
  1.1286 +        } while (--count != 0);
  1.1287 +    }
  1.1288 +}
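          +/*
          + * For reference, a minimal scalar sketch of the per-pixel math the
          + * NEON loop above performs (illustrative only and never compiled;
          + * the helper name is made up). 'dither' is the 4x4 matrix value in
          + * [0, 7] that the vector code loads from gDitherMatrix_Neon.
          + */
          +#if 0
          +static uint16_t dither_32_to_565_sketch(SkPMColor c, unsigned dither) {
          +    unsigned r = SkGetPackedR32(c);
          +    unsigned g = SkGetPackedG32(c);
          +    unsigned b = SkGetPackedB32(c);
          +    r = r - (r >> 5) + dither;          // stays within [0, 255]
          +    g = g - (g >> 6) + (dither >> 1);   // green keeps 6 bits
          +    b = b - (b >> 5) + dither;
          +    return (uint16_t)(((r >> 3) << 11) | ((g >> 2) << 5) | (b >> 3));
          +}
          +#endif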
  1.1289 +
  1.1290 +void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count,
  1.1291 +                      SkPMColor color) {
  1.1292 +    if (count <= 0) {
  1.1293 +        return;
  1.1294 +    }
  1.1295 +
  1.1296 +    if (0 == color) {
  1.1297 +        if (src != dst) {
  1.1298 +            memcpy(dst, src, count * sizeof(SkPMColor));
  1.1299 +        }
  1.1300 +        return;
  1.1301 +    }
  1.1302 +
  1.1303 +    unsigned colorA = SkGetPackedA32(color);
  1.1304 +    if (255 == colorA) {
  1.1305 +        sk_memset32(dst, color, count);
  1.1306 +    } else {
  1.1307 +        unsigned scale = 256 - SkAlpha255To256(colorA);
  1.1308 +
  1.1309 +        if (count >= 8) {
   1.1310 +            // On exit from this assembly block, count will have been
   1.1311 +            // decremented to a negative value: if count mod 8 == x on
   1.1312 +            // entry, it comes out as x - 8.
  1.1313 +            asm volatile (
  1.1314 +                PLD128(src, 0)
  1.1315 +
  1.1316 +                "vdup.32    q0, %[color]                \n\t"
  1.1317 +
  1.1318 +                PLD128(src, 128)
  1.1319 +
   1.1320 +                // scale is at most 255, so it fits in 8 bits; load it as a byte
  1.1321 +                "vdup.8     d2, %[scale]                \n\t"
  1.1322 +
  1.1323 +                PLD128(src, 256)
  1.1324 +
  1.1325 +                "subs       %[count], %[count], #8      \n\t"
  1.1326 +
  1.1327 +                PLD128(src, 384)
  1.1328 +
  1.1329 +                "Loop_Color32:                          \n\t"
  1.1330 +
   1.1331 +                // load 8 src pixels into four 64-bit registers
   1.1332 +                // (and post-increment src).
  1.1333 +                "vld1.32    {d4-d7}, [%[src]]!          \n\t"
  1.1334 +
  1.1335 +                PLD128(src, 384)
  1.1336 +
   1.1337 +                // widening multiply by scale, 64 bits at a time,
   1.1338 +                // with each result in a 128-bit register.
  1.1339 +                "vmull.u8   q4, d4, d2                  \n\t"
  1.1340 +                "vmull.u8   q5, d5, d2                  \n\t"
  1.1341 +                "vmull.u8   q6, d6, d2                  \n\t"
  1.1342 +                "vmull.u8   q7, d7, d2                  \n\t"
  1.1343 +
   1.1344 +                // shift the 128-bit registers holding the 16-bit
   1.1345 +                // scaled values back down by 8 bits, narrowing the
   1.1346 +                // results into 64-bit registers.
  1.1347 +                "vshrn.i16  d8, q4, #8                  \n\t"
  1.1348 +                "vshrn.i16  d9, q5, #8                  \n\t"
  1.1349 +                "vshrn.i16  d10, q6, #8                 \n\t"
  1.1350 +                "vshrn.i16  d11, q7, #8                 \n\t"
  1.1351 +
   1.1352 +                // add back the color, using 128-bit registers.
  1.1353 +                "vadd.i8    q6, q4, q0                  \n\t"
  1.1354 +                "vadd.i8    q7, q5, q0                  \n\t"
  1.1355 +
   1.1356 +                // store the 8 blended pixels (two 128-bit
   1.1357 +                // registers) and post-increment dst.
  1.1358 +                "vst1.32    {d12-d15}, [%[dst]]!        \n\t"
  1.1359 +
  1.1360 +                "subs       %[count], %[count], #8      \n\t"
  1.1361 +                "bge        Loop_Color32                \n\t"
  1.1362 +                : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count)
  1.1363 +                : [color] "r" (color), [scale] "r" (scale)
  1.1364 +                : "cc", "memory",
  1.1365 +                  "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
  1.1366 +                  "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15"
  1.1367 +                          );
   1.1368 +            // At this point, if we went through the inline assembly, count
   1.1369 +            // is negative:
   1.1370 +            // if the value is -8, there are no pixels left to process;
   1.1371 +            // if the value is -7, there is one pixel left to process;
   1.1372 +            // ...
   1.1373 +            // ANDing it with 7 gives the number of pixels left to
   1.1374 +            // process.
  1.1375 +            count = count & 0x7;
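          +            // e.g. count == 13 on entry: the first subs makes it 5, one
          +            // loop pass handles 8 pixels and leaves count == -3, and
          +            // -3 & 0x7 == 5, the five leftover pixels.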
  1.1376 +        }
  1.1377 +
  1.1378 +        while (count > 0) {
  1.1379 +            *dst = color + SkAlphaMulQ(*src, scale);
  1.1380 +            src += 1;
  1.1381 +            dst += 1;
  1.1382 +            count--;
  1.1383 +        }
  1.1384 +    }
  1.1385 +}
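          +/*
          + * The scalar tail above leans on SkAlphaMulQ(), which scales all four
          + * 8-bit channels by scale/256. A sketch of the usual two-multiply
          + * masking trick behind it (illustrative only and never compiled; not
          + * necessarily Skia's exact implementation):
          + */
          +#if 0
          +static uint32_t alpha_mul_q_sketch(uint32_t c, unsigned scale /* 0..256 */) {
          +    uint32_t rb = ((c & 0x00FF00FF) * scale) >> 8;   // red + blue lanes
          +    uint32_t ag = ((c >> 8) & 0x00FF00FF) * scale;   // alpha + green lanes
          +    return (rb & 0x00FF00FF) | (ag & 0xFF00FF00);
          +}
          +#endif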
  1.1386 +
  1.1387 +///////////////////////////////////////////////////////////////////////////////
  1.1388 +
  1.1389 +const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = {
  1.1390 +    // no dither
   1.1391 +    // NOTE: For the S32_D565_Blend slot below, we don't have a special
   1.1392 +    //       version that assumes each source pixel is opaque. But our
   1.1393 +    //       S32A proc is still faster than the default, so use it.
  1.1394 +    S32_D565_Opaque_neon,
  1.1395 +    S32A_D565_Blend_neon,   // really S32_D565_Blend
  1.1396 +    S32A_D565_Opaque_neon,
  1.1397 +    S32A_D565_Blend_neon,
  1.1398 +
  1.1399 +    // dither
  1.1400 +    S32_D565_Opaque_Dither_neon,
  1.1401 +    S32_D565_Blend_Dither_neon,
  1.1402 +    S32A_D565_Opaque_Dither_neon,
  1.1403 +    NULL,   // S32A_D565_Blend_Dither
  1.1404 +};
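          +/*
          + * Note: both tables below and the 565 table above appear to be indexed
          + * by the SkBlitRow flag bits (global alpha, per-pixel alpha, dither),
          + * which would explain the entry order -- a hedged reading of how the
          + * platform-proc lookup works, not something this file spells out.
          + */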
  1.1405 +
  1.1406 +const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = {
  1.1407 +    NULL,   // S32_Opaque,
  1.1408 +    S32_Blend_BlitRow32_neon,        // S32_Blend,
   1.1409 +    /*
   1.1410 +     * We have two choices for S32A_Opaque procs. One of them reads the src
   1.1411 +     * alpha value and attempts to optimize accordingly.  That optimization
   1.1412 +     * is sensitive to the source content and is not a win in all cases: for
   1.1413 +     * example, if there are a lot of transitions between the alpha states,
   1.1414 +     * the performance will almost certainly be worse.  However, for many
   1.1415 +     * common cases the performance is equivalent to or better than the
   1.1416 +     * standard case, where we do not inspect the src alpha.
   1.1417 +     */
  1.1418 +#if SK_A32_SHIFT == 24
   1.1419 +    // This proc assumes the alpha value occupies bits 24-31 (the top byte) of each SkPMColor
  1.1420 +    S32A_Opaque_BlitRow32_neon_src_alpha,   // S32A_Opaque,
  1.1421 +#else
  1.1422 +    S32A_Opaque_BlitRow32_neon,     // S32A_Opaque,
  1.1423 +#endif
  1.1424 +    S32A_Blend_BlitRow32_neon        // S32A_Blend
  1.1425 +};
