gfx/skia/trunk/src/opts/SkXfermode_opts_arm_neon.cpp

changeset 0:6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/skia/trunk/src/opts/SkXfermode_opts_arm_neon.cpp	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,937 @@
     1.4 +#include "SkXfermode.h"
     1.5 +#include "SkXfermode_proccoeff.h"
     1.6 +#include "SkColorPriv.h"
     1.7 +
     1.8 +#include <arm_neon.h>
     1.9 +#include "SkColor_opts_neon.h"
    1.10 +#include "SkXfermode_opts_arm_neon.h"
    1.11 +
    1.12 +#define SkAlphaMulAlpha(a, b)   SkMulDiv255Round(a, b)
    1.13 +
    1.14 +
    1.15 +////////////////////////////////////////////////////////////////////////////////
    1.16 +// NEONized skia functions
    1.17 +////////////////////////////////////////////////////////////////////////////////
    1.18 +
    1.19 +static inline uint8x8_t SkAlphaMulAlpha_neon8(uint8x8_t color, uint8x8_t alpha) {
    1.20 +    uint16x8_t tmp;
    1.21 +    uint8x8_t ret;
    1.22 +
    1.23 +    tmp = vmull_u8(color, alpha);
    1.24 +    tmp = vaddq_u16(tmp, vdupq_n_u16(128));
    1.25 +    tmp = vaddq_u16(tmp, vshrq_n_u16(tmp, 8));
    1.26 +
    1.27 +    ret = vshrn_n_u16(tmp, 8);
    1.28 +
    1.29 +    return ret;
    1.30 +}
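          +
          +// The +128 / add-high-byte / shift-by-8 sequence above is the usual rounding
          +// approximation of a divide by 255. As a scalar sketch (illustrative helper
          +// name, not part of this file):
          +//
          +//     static inline unsigned mul_div255_round(unsigned a, unsigned b) {
          +//         unsigned prod = a * b + 128;
          +//         return (prod + (prod >> 8)) >> 8;   // == SkMulDiv255Round(a, b)
          +//     }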
    1.31 +
    1.32 +static inline uint16x8_t SkAlphaMulAlpha_neon8_16(uint8x8_t color, uint8x8_t alpha) {
    1.33 +    uint16x8_t ret;
    1.34 +
    1.35 +    ret = vmull_u8(color, alpha);
    1.36 +    ret = vaddq_u16(ret, vdupq_n_u16(128));
    1.37 +    ret = vaddq_u16(ret, vshrq_n_u16(ret, 8));
    1.38 +
    1.39 +    ret = vshrq_n_u16(ret, 8);
    1.40 +
    1.41 +    return ret;
    1.42 +}
    1.43 +
    1.44 +static inline uint8x8_t SkDiv255Round_neon8_32_8(int32x4_t p1, int32x4_t p2) {
    1.45 +    uint16x8_t tmp;
    1.46 +
    1.47 +    tmp = vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(p1)),
    1.48 +                       vmovn_u32(vreinterpretq_u32_s32(p2)));
    1.49 +
    1.50 +    tmp += vdupq_n_u16(128);
    1.51 +    tmp += vshrq_n_u16(tmp, 8);
    1.52 +
    1.53 +    return vshrn_n_u16(tmp, 8);
    1.54 +}
    1.55 +
    1.56 +static inline uint16x8_t SkDiv255Round_neon8_16_16(uint16x8_t prod) {
    1.57 +    prod += vdupq_n_u16(128);
    1.58 +    prod += vshrq_n_u16(prod, 8);
    1.59 +
    1.60 +    return vshrq_n_u16(prod, 8);
    1.61 +}
    1.62 +
    1.63 +static inline uint8x8_t clamp_div255round_simd8_32(int32x4_t val1, int32x4_t val2) {
    1.64 +    uint8x8_t ret;
    1.65 +    uint32x4_t cmp1, cmp2;
    1.66 +    uint16x8_t cmp16;
    1.67 +    uint8x8_t cmp8, cmp8_1;
    1.68 +
    1.69 +    // Test if <= 0
    1.70 +    cmp1 = vcleq_s32(val1, vdupq_n_s32(0));
    1.71 +    cmp2 = vcleq_s32(val2, vdupq_n_s32(0));
    1.72 +    cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
    1.73 +    cmp8_1 = vmovn_u16(cmp16);
    1.74 +
    1.75 +    // Init to zero
    1.76 +    ret = vdup_n_u8(0);
    1.77 +
    1.78 +    // Test if >= 255*255
    1.79 +    cmp1 = vcgeq_s32(val1, vdupq_n_s32(255*255));
    1.80 +    cmp2 = vcgeq_s32(val2, vdupq_n_s32(255*255));
    1.81 +    cmp16 = vcombine_u16(vmovn_u32(cmp1), vmovn_u32(cmp2));
    1.82 +    cmp8 = vmovn_u16(cmp16);
    1.83 +
    1.84 +    // Insert 255 where true
    1.85 +    ret = vbsl_u8(cmp8, vdup_n_u8(255), ret);
    1.86 +
    1.87 +    // Calc SkDiv255Round
    1.88 +    uint8x8_t div = SkDiv255Round_neon8_32_8(val1, val2);
    1.89 +
    1.90 +    // Insert where false and previous test false
    1.91 +    cmp8 = cmp8 | cmp8_1;
    1.92 +    ret = vbsl_u8(cmp8, ret, div);
    1.93 +
    1.94 +    // Return the final combination
    1.95 +    return ret;
    1.96 +}
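          +
          +// The three cases above mirror a scalar clamp-then-divide helper: values at or
          +// below 0 clamp to 0, values at or above 255*255 clamp to 255, and everything
          +// in between is divided by 255 with rounding. A scalar sketch (illustrative
          +// name, not part of this file):
          +//
          +//     static inline int clamp_div255round_scalar(int prod) {
          +//         if (prod <= 0)         return 0;
          +//         if (prod >= 255 * 255) return 255;
          +//         prod += 128;
          +//         return (prod + (prod >> 8)) >> 8;   // SkDiv255Round(prod)
          +//     }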
    1.97 +
    1.98 +////////////////////////////////////////////////////////////////////////////////
    1.99 +// 1-pixel modeprocs
   1.100 +////////////////////////////////////////////////////////////////////////////////
   1.101 +
   1.102 +//  kSrcATop_Mode,  //!< [Da, Sc * Da + (1 - Sa) * Dc]
   1.103 +SkPMColor srcatop_modeproc_neon(SkPMColor src, SkPMColor dst) {
   1.104 +    unsigned sa = SkGetPackedA32(src);
   1.105 +    unsigned da = SkGetPackedA32(dst);
   1.106 +    unsigned isa = 255 - sa;
   1.107 +
   1.108 +    uint8x8_t vda, visa, vsrc, vdst;
   1.109 +
   1.110 +    vda = vdup_n_u8(da);
   1.111 +    visa = vdup_n_u8(isa);
   1.112 +
   1.113 +    uint16x8_t vsrc_wide, vdst_wide;
   1.114 +    vsrc_wide = vmull_u8(vda, vreinterpret_u8_u32(vdup_n_u32(src)));
   1.115 +    vdst_wide = vmull_u8(visa, vreinterpret_u8_u32(vdup_n_u32(dst)));
   1.116 +
   1.117 +    vsrc_wide += vdupq_n_u16(128);
   1.118 +    vsrc_wide += vshrq_n_u16(vsrc_wide, 8);
   1.119 +
   1.120 +    vdst_wide += vdupq_n_u16(128);
   1.121 +    vdst_wide += vshrq_n_u16(vdst_wide, 8);
   1.122 +
   1.123 +    vsrc = vshrn_n_u16(vsrc_wide, 8);
   1.124 +    vdst = vshrn_n_u16(vdst_wide, 8);
   1.125 +
   1.126 +    vsrc += vdst;
   1.127 +    vsrc = vset_lane_u8(da, vsrc, 3);
   1.128 +
   1.129 +    return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
   1.130 +}
   1.131 +
   1.132 +//  kDstATop_Mode,  //!< [Sa, Sa * Dc + Sc * (1 - Da)]
   1.133 +SkPMColor dstatop_modeproc_neon(SkPMColor src, SkPMColor dst) {
   1.134 +    unsigned sa = SkGetPackedA32(src);
   1.135 +    unsigned da = SkGetPackedA32(dst);
   1.136 +    unsigned ida = 255 - da;
   1.137 +
   1.138 +    uint8x8_t vsa, vida, vsrc, vdst;
   1.139 +
   1.140 +    vsa = vdup_n_u8(sa);
   1.141 +    vida = vdup_n_u8(ida);
   1.142 +
   1.143 +    uint16x8_t vsrc_wide, vdst_wide;
   1.144 +    vsrc_wide = vmull_u8(vida, vreinterpret_u8_u32(vdup_n_u32(src)));
   1.145 +    vdst_wide = vmull_u8(vsa, vreinterpret_u8_u32(vdup_n_u32(dst)));
   1.146 +
   1.147 +    vsrc_wide += vdupq_n_u16(128);
   1.148 +    vsrc_wide += vshrq_n_u16(vsrc_wide, 8);
   1.149 +
   1.150 +    vdst_wide += vdupq_n_u16(128);
   1.151 +    vdst_wide += vshrq_n_u16(vdst_wide, 8);
   1.152 +
   1.153 +    vsrc = vshrn_n_u16(vsrc_wide, 8);
   1.154 +    vdst = vshrn_n_u16(vdst_wide, 8);
   1.155 +
   1.156 +    vsrc += vdst;
   1.157 +    vsrc = vset_lane_u8(sa, vsrc, 3);
   1.158 +
   1.159 +    return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
   1.160 +}
   1.161 +
   1.162 +//  kXor_Mode   [Sa + Da - 2 * Sa * Da, Sc * (1 - Da) + (1 - Sa) * Dc]
   1.163 +SkPMColor xor_modeproc_neon(SkPMColor src, SkPMColor dst) {
   1.164 +    unsigned sa = SkGetPackedA32(src);
   1.165 +    unsigned da = SkGetPackedA32(dst);
   1.166 +    unsigned ret_alpha = sa + da - (SkAlphaMulAlpha(sa, da) << 1);
   1.167 +    unsigned isa = 255 - sa;
   1.168 +    unsigned ida = 255 - da;
   1.169 +
   1.170 +    uint8x8_t vsrc, vdst, visa, vida;
   1.171 +    uint16x8_t vsrc_wide, vdst_wide;
   1.172 +
   1.173 +    visa = vdup_n_u8(isa);
   1.174 +    vida = vdup_n_u8(ida);
   1.175 +    vsrc = vreinterpret_u8_u32(vdup_n_u32(src));
   1.176 +    vdst = vreinterpret_u8_u32(vdup_n_u32(dst));
   1.177 +
   1.178 +    vsrc_wide = vmull_u8(vsrc, vida);
   1.179 +    vdst_wide = vmull_u8(vdst, visa);
   1.180 +
   1.181 +    vsrc_wide += vdupq_n_u16(128);
   1.182 +    vsrc_wide += vshrq_n_u16(vsrc_wide, 8);
   1.183 +
   1.184 +    vdst_wide += vdupq_n_u16(128);
   1.185 +    vdst_wide += vshrq_n_u16(vdst_wide, 8);
   1.186 +
   1.187 +    vsrc = vshrn_n_u16(vsrc_wide, 8);
   1.188 +    vdst = vshrn_n_u16(vdst_wide, 8);
   1.189 +
   1.190 +    vsrc += vdst;
   1.191 +
   1.192 +    vsrc = vset_lane_u8(ret_alpha, vsrc, 3);
   1.193 +
   1.194 +    return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
   1.195 +}
   1.196 +
   1.197 +// kPlus_Mode
   1.198 +SkPMColor plus_modeproc_neon(SkPMColor src, SkPMColor dst) {
   1.199 +    uint8x8_t vsrc, vdst;
   1.200 +    vsrc = vreinterpret_u8_u32(vdup_n_u32(src));
   1.201 +    vdst = vreinterpret_u8_u32(vdup_n_u32(dst));
   1.202 +    vsrc = vqadd_u8(vsrc, vdst);
   1.203 +
   1.204 +    return vget_lane_u32(vreinterpret_u32_u8(vsrc), 0);
   1.205 +}
   1.206 +
   1.207 +// kModulate_Mode
   1.208 +SkPMColor modulate_modeproc_neon(SkPMColor src, SkPMColor dst) {
   1.209 +    uint8x8_t vsrc, vdst, vres;
   1.210 +    uint16x8_t vres_wide;
   1.211 +
   1.212 +    vsrc = vreinterpret_u8_u32(vdup_n_u32(src));
   1.213 +    vdst = vreinterpret_u8_u32(vdup_n_u32(dst));
   1.214 +
   1.215 +    vres_wide = vmull_u8(vsrc, vdst);
   1.216 +
   1.217 +    vres_wide += vdupq_n_u16(128);
   1.218 +    vres_wide += vshrq_n_u16(vres_wide, 8);
   1.219 +
   1.220 +    vres = vshrn_n_u16(vres_wide, 8);
   1.221 +
   1.222 +    return vget_lane_u32(vreinterpret_u32_u8(vres), 0);
   1.223 +}
   1.224 +
   1.225 +////////////////////////////////////////////////////////////////////////////////
    1.226 +// 8-pixel modeprocs
   1.227 +////////////////////////////////////////////////////////////////////////////////
   1.228 +
   1.229 +uint8x8x4_t dstover_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
   1.230 +    uint8x8x4_t ret;
   1.231 +    uint16x8_t src_scale;
   1.232 +
   1.233 +    src_scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]);
   1.234 +
   1.235 +    ret.val[NEON_A] = dst.val[NEON_A] + SkAlphaMul_neon8(src.val[NEON_A], src_scale);
   1.236 +    ret.val[NEON_R] = dst.val[NEON_R] + SkAlphaMul_neon8(src.val[NEON_R], src_scale);
   1.237 +    ret.val[NEON_G] = dst.val[NEON_G] + SkAlphaMul_neon8(src.val[NEON_G], src_scale);
   1.238 +    ret.val[NEON_B] = dst.val[NEON_B] + SkAlphaMul_neon8(src.val[NEON_B], src_scale);
   1.239 +
   1.240 +    return ret;
   1.241 +}
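          +
          +// Per channel this is "dst over": D + S * (255 - Da) / 255. The division is
          +// folded into SkAlphaMul_neon8 via the 256-based scale computed above
          +// (src_scale = 256 - Da), so the multiply reduces to a plain shift by 8.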
   1.242 +
   1.243 +uint8x8x4_t srcin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
   1.244 +    uint8x8x4_t ret;
   1.245 +    uint16x8_t scale;
   1.246 +
   1.247 +    scale = SkAlpha255To256_neon8(dst.val[NEON_A]);
   1.248 +
   1.249 +    ret.val[NEON_A] = SkAlphaMul_neon8(src.val[NEON_A], scale);
   1.250 +    ret.val[NEON_R] = SkAlphaMul_neon8(src.val[NEON_R], scale);
   1.251 +    ret.val[NEON_G] = SkAlphaMul_neon8(src.val[NEON_G], scale);
   1.252 +    ret.val[NEON_B] = SkAlphaMul_neon8(src.val[NEON_B], scale);
   1.253 +
   1.254 +    return ret;
   1.255 +}
   1.256 +
   1.257 +uint8x8x4_t dstin_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
   1.258 +    uint8x8x4_t ret;
   1.259 +    uint16x8_t scale;
   1.260 +
   1.261 +    scale = SkAlpha255To256_neon8(src.val[NEON_A]);
   1.262 +
   1.263 +    ret = SkAlphaMulQ_neon8(dst, scale);
   1.264 +
   1.265 +    return ret;
   1.266 +}
   1.267 +
   1.268 +uint8x8x4_t srcout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
   1.269 +    uint8x8x4_t ret;
   1.270 +    uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), dst.val[NEON_A]);
   1.271 +
   1.272 +    ret = SkAlphaMulQ_neon8(src, scale);
   1.273 +
   1.274 +    return ret;
   1.275 +}
   1.276 +
   1.277 +uint8x8x4_t dstout_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
   1.278 +    uint8x8x4_t ret;
   1.279 +    uint16x8_t scale = vsubw_u8(vdupq_n_u16(256), src.val[NEON_A]);
   1.280 +
   1.281 +    ret = SkAlphaMulQ_neon8(dst, scale);
   1.282 +
   1.283 +    return ret;
   1.284 +}
   1.285 +
   1.286 +uint8x8x4_t srcatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
   1.287 +    uint8x8x4_t ret;
   1.288 +    uint8x8_t isa;
   1.289 +
   1.290 +    isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]);
   1.291 +
   1.292 +    ret.val[NEON_A] = dst.val[NEON_A];
   1.293 +    ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_A])
   1.294 +                      + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa);
   1.295 +    ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_A])
   1.296 +                      + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa);
   1.297 +    ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_A])
   1.298 +                      + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa);
   1.299 +
   1.300 +    return ret;
   1.301 +}
   1.302 +
   1.303 +uint8x8x4_t dstatop_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
   1.304 +    uint8x8x4_t ret;
   1.305 +    uint8x8_t ida;
   1.306 +
   1.307 +    ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]);
   1.308 +
   1.309 +    ret.val[NEON_A] = src.val[NEON_A];
   1.310 +    ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida)
   1.311 +                      + SkAlphaMulAlpha_neon8(dst.val[NEON_R], src.val[NEON_A]);
   1.312 +    ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida)
   1.313 +                      + SkAlphaMulAlpha_neon8(dst.val[NEON_G], src.val[NEON_A]);
   1.314 +    ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida)
   1.315 +                      + SkAlphaMulAlpha_neon8(dst.val[NEON_B], src.val[NEON_A]);
   1.316 +
   1.317 +    return ret;
   1.318 +}
   1.319 +
   1.320 +uint8x8x4_t xor_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
   1.321 +    uint8x8x4_t ret;
   1.322 +    uint8x8_t isa, ida;
   1.323 +    uint16x8_t tmp_wide, tmp_wide2;
   1.324 +
   1.325 +    isa = vsub_u8(vdup_n_u8(255), src.val[NEON_A]);
   1.326 +    ida = vsub_u8(vdup_n_u8(255), dst.val[NEON_A]);
   1.327 +
   1.328 +    // First calc alpha
   1.329 +    tmp_wide = vmovl_u8(src.val[NEON_A]);
   1.330 +    tmp_wide = vaddw_u8(tmp_wide, dst.val[NEON_A]);
   1.331 +    tmp_wide2 = vshll_n_u8(SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]), 1);
   1.332 +    tmp_wide = vsubq_u16(tmp_wide, tmp_wide2);
   1.333 +    ret.val[NEON_A] = vmovn_u16(tmp_wide);
   1.334 +
   1.335 +    // Then colors
   1.336 +    ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], ida)
   1.337 +                      + SkAlphaMulAlpha_neon8(dst.val[NEON_R], isa);
   1.338 +    ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], ida)
   1.339 +                      + SkAlphaMulAlpha_neon8(dst.val[NEON_G], isa);
   1.340 +    ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], ida)
   1.341 +                      + SkAlphaMulAlpha_neon8(dst.val[NEON_B], isa);
   1.342 +
   1.343 +    return ret;
   1.344 +}
   1.345 +
   1.346 +uint8x8x4_t plus_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
   1.347 +    uint8x8x4_t ret;
   1.348 +
   1.349 +    ret.val[NEON_A] = vqadd_u8(src.val[NEON_A], dst.val[NEON_A]);
   1.350 +    ret.val[NEON_R] = vqadd_u8(src.val[NEON_R], dst.val[NEON_R]);
   1.351 +    ret.val[NEON_G] = vqadd_u8(src.val[NEON_G], dst.val[NEON_G]);
   1.352 +    ret.val[NEON_B] = vqadd_u8(src.val[NEON_B], dst.val[NEON_B]);
   1.353 +
   1.354 +    return ret;
   1.355 +}
   1.356 +
   1.357 +uint8x8x4_t modulate_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
   1.358 +    uint8x8x4_t ret;
   1.359 +
   1.360 +    ret.val[NEON_A] = SkAlphaMulAlpha_neon8(src.val[NEON_A], dst.val[NEON_A]);
   1.361 +    ret.val[NEON_R] = SkAlphaMulAlpha_neon8(src.val[NEON_R], dst.val[NEON_R]);
   1.362 +    ret.val[NEON_G] = SkAlphaMulAlpha_neon8(src.val[NEON_G], dst.val[NEON_G]);
   1.363 +    ret.val[NEON_B] = SkAlphaMulAlpha_neon8(src.val[NEON_B], dst.val[NEON_B]);
   1.364 +
   1.365 +    return ret;
   1.366 +}
   1.367 +
   1.368 +static inline uint8x8_t srcover_color(uint8x8_t a, uint8x8_t b) {
   1.369 +    uint16x8_t tmp;
   1.370 +
   1.371 +    tmp = vaddl_u8(a, b);
   1.372 +    tmp -= SkAlphaMulAlpha_neon8_16(a, b);
   1.373 +
   1.374 +    return vmovn_u16(tmp);
   1.375 +}
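          +
          +// srcover_color computes a + b - a*b/255 per lane, i.e. the standard
          +// "source over" formula. The modes below reuse it both for the result alpha
          +// (Sa + Da - Sa*Da/255) and, in screen_modeproc_neon8, for every channel,
          +// since screen uses the same expression on color channels.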
   1.376 +
   1.377 +uint8x8x4_t screen_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
   1.378 +    uint8x8x4_t ret;
   1.379 +
   1.380 +    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
   1.381 +    ret.val[NEON_R] = srcover_color(src.val[NEON_R], dst.val[NEON_R]);
   1.382 +    ret.val[NEON_G] = srcover_color(src.val[NEON_G], dst.val[NEON_G]);
   1.383 +    ret.val[NEON_B] = srcover_color(src.val[NEON_B], dst.val[NEON_B]);
   1.384 +
   1.385 +    return ret;
   1.386 +}
   1.387 +
   1.388 +template <bool overlay>
   1.389 +static inline uint8x8_t overlay_hardlight_color(uint8x8_t sc, uint8x8_t dc,
   1.390 +                                               uint8x8_t sa, uint8x8_t da) {
    1.391 +    /*
    1.392 +     * The final result is (rc + tmp), where rc differs between the two
    1.393 +     * branches of the scalar implementation (see the reference sketch below).
    1.394 +     * In both branches the whole value (rc + tmp) can be rewritten as
    1.395 +     * VAL = COM - SUB            in the "if" case
    1.396 +     * VAL = COM + SUB - sa*da    in the "else" case
    1.397 +     *
    1.398 +     * with COM = 255 * (sc + dc)
    1.399 +     * and  SUB = sc*da + dc*sa - 2*sc*dc
    1.400 +     */
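          +
          +    /*
          +     * For reference, a sketch of the scalar overlay byte helper (names are
          +     * illustrative):
          +     *
          +     *     tmp = sc*(255 - da) + dc*(255 - sa);
          +     *     if (2*dc <= da)  rc = 2*sc*dc;
          +     *     else             rc = sa*da - 2*(da - dc)*(sa - sc);
          +     *     result = clamp_div255round(rc + tmp);
          +     *
          +     * Expanding rc + tmp gives 255*(sc + dc) - (sc*da + dc*sa - 2*sc*dc) in
          +     * the first branch and 255*(sc + dc) + (sc*da + dc*sa - 2*sc*dc) - sa*da
          +     * in the second, i.e. COM - SUB and COM + SUB - sa*da. Hardlight only
          +     * changes the test to 2*sc <= sa, which is what the template parameter
          +     * selects below.
          +     */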
   1.401 +
   1.402 +    // Prepare common subexpressions
   1.403 +    uint16x8_t const255 = vdupq_n_u16(255);
   1.404 +    uint16x8_t sc_plus_dc = vaddl_u8(sc, dc);
   1.405 +    uint16x8_t scda = vmull_u8(sc, da);
   1.406 +    uint16x8_t dcsa = vmull_u8(dc, sa);
   1.407 +    uint16x8_t sada = vmull_u8(sa, da);
   1.408 +
   1.409 +    // Prepare non common subexpressions
   1.410 +    uint16x8_t dc2, sc2;
   1.411 +    uint32x4_t scdc2_1, scdc2_2;
   1.412 +    if (overlay) {
   1.413 +        dc2 = vshll_n_u8(dc, 1);
   1.414 +        scdc2_1 = vmull_u16(vget_low_u16(dc2), vget_low_u16(vmovl_u8(sc)));
   1.415 +        scdc2_2 = vmull_u16(vget_high_u16(dc2), vget_high_u16(vmovl_u8(sc)));
   1.416 +    } else {
   1.417 +        sc2 = vshll_n_u8(sc, 1);
   1.418 +        scdc2_1 = vmull_u16(vget_low_u16(sc2), vget_low_u16(vmovl_u8(dc)));
   1.419 +        scdc2_2 = vmull_u16(vget_high_u16(sc2), vget_high_u16(vmovl_u8(dc)));
   1.420 +    }
   1.421 +
   1.422 +    // Calc COM
   1.423 +    int32x4_t com1, com2;
   1.424 +    com1 = vreinterpretq_s32_u32(
   1.425 +                vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
   1.426 +    com2 = vreinterpretq_s32_u32(
   1.427 +                vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));
   1.428 +
   1.429 +    // Calc SUB
   1.430 +    int32x4_t sub1, sub2;
   1.431 +    sub1 = vreinterpretq_s32_u32(vaddl_u16(vget_low_u16(scda), vget_low_u16(dcsa)));
   1.432 +    sub2 = vreinterpretq_s32_u32(vaddl_u16(vget_high_u16(scda), vget_high_u16(dcsa)));
   1.433 +    sub1 = vsubq_s32(sub1, vreinterpretq_s32_u32(scdc2_1));
   1.434 +    sub2 = vsubq_s32(sub2, vreinterpretq_s32_u32(scdc2_2));
   1.435 +
   1.436 +    // Compare 2*dc <= da
   1.437 +    uint16x8_t cmp;
   1.438 +
   1.439 +    if (overlay) {
   1.440 +        cmp = vcleq_u16(dc2, vmovl_u8(da));
   1.441 +    } else {
   1.442 +        cmp = vcleq_u16(sc2, vmovl_u8(sa));
   1.443 +    }
   1.444 +
   1.445 +    // Prepare variables
   1.446 +    int32x4_t val1_1, val1_2;
   1.447 +    int32x4_t val2_1, val2_2;
   1.448 +    uint32x4_t cmp1, cmp2;
   1.449 +
   1.450 +    cmp1 = vmovl_u16(vget_low_u16(cmp));
   1.451 +    cmp1 |= vshlq_n_u32(cmp1, 16);
   1.452 +    cmp2 = vmovl_u16(vget_high_u16(cmp));
   1.453 +    cmp2 |= vshlq_n_u32(cmp2, 16);
   1.454 +
   1.455 +    // Calc COM - SUB
   1.456 +    val1_1 = com1 - sub1;
   1.457 +    val1_2 = com2 - sub2;
   1.458 +
   1.459 +    // Calc COM + SUB - sa*da
   1.460 +    val2_1 = com1 + sub1;
   1.461 +    val2_2 = com2 + sub2;
   1.462 +
   1.463 +    val2_1 = vsubq_s32(val2_1, vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(sada))));
   1.464 +    val2_2 = vsubq_s32(val2_2, vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(sada))));
   1.465 +
   1.466 +    // Insert where needed
   1.467 +    val1_1 = vbslq_s32(cmp1, val1_1, val2_1);
   1.468 +    val1_2 = vbslq_s32(cmp2, val1_2, val2_2);
   1.469 +
   1.470 +    // Call the clamp_div255round function
   1.471 +    return clamp_div255round_simd8_32(val1_1, val1_2);
   1.472 +}
   1.473 +
   1.474 +static inline uint8x8_t overlay_color(uint8x8_t sc, uint8x8_t dc,
   1.475 +                                      uint8x8_t sa, uint8x8_t da) {
   1.476 +    return overlay_hardlight_color<true>(sc, dc, sa, da);
   1.477 +}
   1.478 +
   1.479 +uint8x8x4_t overlay_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
   1.480 +    uint8x8x4_t ret;
   1.481 +
   1.482 +    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
   1.483 +    ret.val[NEON_R] = overlay_color(src.val[NEON_R], dst.val[NEON_R],
   1.484 +                                    src.val[NEON_A], dst.val[NEON_A]);
   1.485 +    ret.val[NEON_G] = overlay_color(src.val[NEON_G], dst.val[NEON_G],
   1.486 +                                    src.val[NEON_A], dst.val[NEON_A]);
   1.487 +    ret.val[NEON_B] = overlay_color(src.val[NEON_B], dst.val[NEON_B],
   1.488 +                                    src.val[NEON_A], dst.val[NEON_A]);
   1.489 +
   1.490 +    return ret;
   1.491 +}
   1.492 +
   1.493 +template <bool lighten>
   1.494 +static inline uint8x8_t lighten_darken_color(uint8x8_t sc, uint8x8_t dc,
   1.495 +                                             uint8x8_t sa, uint8x8_t da) {
   1.496 +    uint16x8_t sd, ds, cmp, tmp, tmp2;
   1.497 +
   1.498 +    // Prepare
   1.499 +    sd = vmull_u8(sc, da);
   1.500 +    ds = vmull_u8(dc, sa);
   1.501 +
   1.502 +    // Do test
   1.503 +    if (lighten) {
   1.504 +        cmp = vcgtq_u16(sd, ds);
   1.505 +    } else {
   1.506 +        cmp = vcltq_u16(sd, ds);
   1.507 +    }
   1.508 +
   1.509 +    // Assign if
   1.510 +    tmp = vaddl_u8(sc, dc);
   1.511 +    tmp2 = tmp;
   1.512 +    tmp -= SkDiv255Round_neon8_16_16(ds);
   1.513 +
   1.514 +    // Calc else
   1.515 +    tmp2 -= SkDiv255Round_neon8_16_16(sd);
   1.516 +
   1.517 +    // Insert where needed
   1.518 +    tmp = vbslq_u16(cmp, tmp, tmp2);
   1.519 +
   1.520 +    return vmovn_u16(tmp);
   1.521 +}
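          +
          +// Scalar reference for the select above (a sketch): with sd = sc*da and
          +// ds = dc*sa, lighten is
          +//
          +//     result = (sd > ds) ? sc + dc - SkDiv255Round(ds)
          +//                        : sc + dc - SkDiv255Round(sd);
          +//
          +// and darken is the same expression with the comparison reversed (sd < ds),
          +// which is exactly what the template flag and vbslq_u16 implement.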
   1.522 +
   1.523 +static inline uint8x8_t darken_color(uint8x8_t sc, uint8x8_t dc,
   1.524 +                                     uint8x8_t sa, uint8x8_t da) {
   1.525 +    return lighten_darken_color<false>(sc, dc, sa, da);
   1.526 +}
   1.527 +
   1.528 +uint8x8x4_t darken_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
   1.529 +    uint8x8x4_t ret;
   1.530 +
   1.531 +    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
   1.532 +    ret.val[NEON_R] = darken_color(src.val[NEON_R], dst.val[NEON_R],
   1.533 +                                   src.val[NEON_A], dst.val[NEON_A]);
   1.534 +    ret.val[NEON_G] = darken_color(src.val[NEON_G], dst.val[NEON_G],
   1.535 +                                   src.val[NEON_A], dst.val[NEON_A]);
   1.536 +    ret.val[NEON_B] = darken_color(src.val[NEON_B], dst.val[NEON_B],
   1.537 +                                   src.val[NEON_A], dst.val[NEON_A]);
   1.538 +
   1.539 +    return ret;
   1.540 +}
   1.541 +
   1.542 +static inline uint8x8_t lighten_color(uint8x8_t sc, uint8x8_t dc,
   1.543 +                                      uint8x8_t sa, uint8x8_t da) {
   1.544 +    return lighten_darken_color<true>(sc, dc, sa, da);
   1.545 +}
   1.546 +
   1.547 +uint8x8x4_t lighten_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
   1.548 +    uint8x8x4_t ret;
   1.549 +
   1.550 +    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
   1.551 +    ret.val[NEON_R] = lighten_color(src.val[NEON_R], dst.val[NEON_R],
   1.552 +                                    src.val[NEON_A], dst.val[NEON_A]);
   1.553 +    ret.val[NEON_G] = lighten_color(src.val[NEON_G], dst.val[NEON_G],
   1.554 +                                    src.val[NEON_A], dst.val[NEON_A]);
   1.555 +    ret.val[NEON_B] = lighten_color(src.val[NEON_B], dst.val[NEON_B],
   1.556 +                                    src.val[NEON_A], dst.val[NEON_A]);
   1.557 +
   1.558 +    return ret;
   1.559 +}
   1.560 +
   1.561 +static inline uint8x8_t hardlight_color(uint8x8_t sc, uint8x8_t dc,
   1.562 +                                        uint8x8_t sa, uint8x8_t da) {
   1.563 +    return overlay_hardlight_color<false>(sc, dc, sa, da);
   1.564 +}
   1.565 +
   1.566 +uint8x8x4_t hardlight_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
   1.567 +    uint8x8x4_t ret;
   1.568 +
   1.569 +    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
   1.570 +    ret.val[NEON_R] = hardlight_color(src.val[NEON_R], dst.val[NEON_R],
   1.571 +                                      src.val[NEON_A], dst.val[NEON_A]);
   1.572 +    ret.val[NEON_G] = hardlight_color(src.val[NEON_G], dst.val[NEON_G],
   1.573 +                                      src.val[NEON_A], dst.val[NEON_A]);
   1.574 +    ret.val[NEON_B] = hardlight_color(src.val[NEON_B], dst.val[NEON_B],
   1.575 +                                      src.val[NEON_A], dst.val[NEON_A]);
   1.576 +
   1.577 +    return ret;
   1.578 +}
   1.579 +
   1.580 +static inline uint8x8_t difference_color(uint8x8_t sc, uint8x8_t dc,
   1.581 +                                         uint8x8_t sa, uint8x8_t da) {
   1.582 +    uint16x8_t sd, ds, tmp;
   1.583 +    int16x8_t val;
   1.584 +
   1.585 +    sd = vmull_u8(sc, da);
   1.586 +    ds = vmull_u8(dc, sa);
   1.587 +
   1.588 +    tmp = vminq_u16(sd, ds);
   1.589 +    tmp = SkDiv255Round_neon8_16_16(tmp);
   1.590 +    tmp = vshlq_n_u16(tmp, 1);
   1.591 +
   1.592 +    val = vreinterpretq_s16_u16(vaddl_u8(sc, dc));
   1.593 +
   1.594 +    val -= vreinterpretq_s16_u16(tmp);
   1.595 +
   1.596 +    val = vmaxq_s16(val, vdupq_n_s16(0));
   1.597 +    val = vminq_s16(val, vdupq_n_s16(255));
   1.598 +
   1.599 +    return vmovn_u16(vreinterpretq_u16_s16(val));
   1.600 +}
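          +
          +// Scalar reference (a sketch): difference is
          +//
          +//     result = clamp_to_byte(sc + dc - 2 * SkDiv255Round(min(sc*da, dc*sa)));
          +//
          +// the vminq/vshlq pair above builds the 2*min(...) term and the final
          +// vmaxq/vminq pair is the clamp to [0, 255]. clamp_to_byte is an illustrative
          +// name for that clamp.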
   1.601 +
   1.602 +uint8x8x4_t difference_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
   1.603 +    uint8x8x4_t ret;
   1.604 +
   1.605 +    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
   1.606 +    ret.val[NEON_R] = difference_color(src.val[NEON_R], dst.val[NEON_R],
   1.607 +                                       src.val[NEON_A], dst.val[NEON_A]);
   1.608 +    ret.val[NEON_G] = difference_color(src.val[NEON_G], dst.val[NEON_G],
   1.609 +                                       src.val[NEON_A], dst.val[NEON_A]);
   1.610 +    ret.val[NEON_B] = difference_color(src.val[NEON_B], dst.val[NEON_B],
   1.611 +                                       src.val[NEON_A], dst.val[NEON_A]);
   1.612 +
   1.613 +    return ret;
   1.614 +}
   1.615 +
   1.616 +static inline uint8x8_t exclusion_color(uint8x8_t sc, uint8x8_t dc,
   1.617 +                                        uint8x8_t sa, uint8x8_t da) {
   1.618 +    /* The equation can be simplified to 255(sc + dc) - 2 * sc * dc */
   1.619 +
   1.620 +    uint16x8_t sc_plus_dc, scdc, const255;
   1.621 +    int32x4_t term1_1, term1_2, term2_1, term2_2;
   1.622 +
   1.623 +    /* Calc (sc + dc) and (sc * dc) */
   1.624 +    sc_plus_dc = vaddl_u8(sc, dc);
   1.625 +    scdc = vmull_u8(sc, dc);
   1.626 +
   1.627 +    /* Prepare constants */
   1.628 +    const255 = vdupq_n_u16(255);
   1.629 +
   1.630 +    /* Calc the first term */
   1.631 +    term1_1 = vreinterpretq_s32_u32(
   1.632 +                vmull_u16(vget_low_u16(const255), vget_low_u16(sc_plus_dc)));
   1.633 +    term1_2 = vreinterpretq_s32_u32(
   1.634 +                vmull_u16(vget_high_u16(const255), vget_high_u16(sc_plus_dc)));
   1.635 +
   1.636 +    /* Calc the second term */
   1.637 +    term2_1 = vreinterpretq_s32_u32(vshll_n_u16(vget_low_u16(scdc), 1));
   1.638 +    term2_2 = vreinterpretq_s32_u32(vshll_n_u16(vget_high_u16(scdc), 1));
   1.639 +
   1.640 +    return clamp_div255round_simd8_32(term1_1 - term2_1, term1_2 - term2_2);
   1.641 +}
   1.642 +
   1.643 +uint8x8x4_t exclusion_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
   1.644 +    uint8x8x4_t ret;
   1.645 +
   1.646 +    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
   1.647 +    ret.val[NEON_R] = exclusion_color(src.val[NEON_R], dst.val[NEON_R],
   1.648 +                                      src.val[NEON_A], dst.val[NEON_A]);
   1.649 +    ret.val[NEON_G] = exclusion_color(src.val[NEON_G], dst.val[NEON_G],
   1.650 +                                      src.val[NEON_A], dst.val[NEON_A]);
   1.651 +    ret.val[NEON_B] = exclusion_color(src.val[NEON_B], dst.val[NEON_B],
   1.652 +                                      src.val[NEON_A], dst.val[NEON_A]);
   1.653 +
   1.654 +    return ret;
   1.655 +}
   1.656 +
   1.657 +static inline uint8x8_t blendfunc_multiply_color(uint8x8_t sc, uint8x8_t dc,
   1.658 +                                                 uint8x8_t sa, uint8x8_t da) {
   1.659 +    uint32x4_t val1, val2;
   1.660 +    uint16x8_t scdc, t1, t2;
   1.661 +
   1.662 +    t1 = vmull_u8(sc, vdup_n_u8(255) - da);
   1.663 +    t2 = vmull_u8(dc, vdup_n_u8(255) - sa);
   1.664 +    scdc = vmull_u8(sc, dc);
   1.665 +
   1.666 +    val1 = vaddl_u16(vget_low_u16(t1), vget_low_u16(t2));
   1.667 +    val2 = vaddl_u16(vget_high_u16(t1), vget_high_u16(t2));
   1.668 +
   1.669 +    val1 = vaddw_u16(val1, vget_low_u16(scdc));
   1.670 +    val2 = vaddw_u16(val2, vget_high_u16(scdc));
   1.671 +
   1.672 +    return clamp_div255round_simd8_32(
   1.673 +                vreinterpretq_s32_u32(val1), vreinterpretq_s32_u32(val2));
   1.674 +}
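          +
          +// Scalar reference (a sketch): multiply for premultiplied colors is
          +//
          +//     result = clamp_div255round(sc*(255 - da) + dc*(255 - sa) + sc*dc);
          +//
          +// which is the t1 + t2 + scdc sum accumulated in 32 bits above.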
   1.675 +
   1.676 +uint8x8x4_t multiply_modeproc_neon8(uint8x8x4_t src, uint8x8x4_t dst) {
   1.677 +    uint8x8x4_t ret;
   1.678 +
   1.679 +    ret.val[NEON_A] = srcover_color(src.val[NEON_A], dst.val[NEON_A]);
   1.680 +    ret.val[NEON_R] = blendfunc_multiply_color(src.val[NEON_R], dst.val[NEON_R],
   1.681 +                                               src.val[NEON_A], dst.val[NEON_A]);
   1.682 +    ret.val[NEON_G] = blendfunc_multiply_color(src.val[NEON_G], dst.val[NEON_G],
   1.683 +                                               src.val[NEON_A], dst.val[NEON_A]);
   1.684 +    ret.val[NEON_B] = blendfunc_multiply_color(src.val[NEON_B], dst.val[NEON_B],
   1.685 +                                               src.val[NEON_A], dst.val[NEON_A]);
   1.686 +
   1.687 +    return ret;
   1.688 +}
   1.689 +
   1.690 +////////////////////////////////////////////////////////////////////////////////
   1.691 +
   1.692 +typedef uint8x8x4_t (*SkXfermodeProcSIMD)(uint8x8x4_t src, uint8x8x4_t dst);
   1.693 +
   1.694 +extern SkXfermodeProcSIMD gNEONXfermodeProcs[];
   1.695 +
   1.696 +SkNEONProcCoeffXfermode::SkNEONProcCoeffXfermode(SkReadBuffer& buffer)
   1.697 +        : INHERITED(buffer) {
   1.698 +    fProcSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[this->getMode()]);
   1.699 +}
   1.700 +
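          +// xfer32 blends 8 pixels per iteration: src and dst are loaded with
          +// de-interleaving vld4 so each uint8x8_t holds one channel of 8 pixels, the
          +// mode-specific procSIMD does the blend, and vst4_u8 re-interleaves on store.
          +// The inline asm comes in two flavours: the %h[...] multi-register operand is
          +// (presumably) only accepted by GCC > 4.6, so older compilers get a variant
          +// that pins the loads to d0-d7 explicitly. Trailing pixels (< 8) and the
          +// per-pixel-alpha path fall back to the scalar proc.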
   1.701 +void SkNEONProcCoeffXfermode::xfer32(SkPMColor dst[], const SkPMColor src[],
   1.702 +                                     int count, const SkAlpha aa[]) const {
   1.703 +    SkASSERT(dst && src && count >= 0);
   1.704 +
   1.705 +    SkXfermodeProc proc = this->getProc();
   1.706 +    SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
   1.707 +    SkASSERT(procSIMD != NULL);
   1.708 +
   1.709 +    if (NULL == aa) {
   1.710 +        // Unrolled NEON code
   1.711 +        while (count >= 8) {
   1.712 +            uint8x8x4_t vsrc, vdst, vres;
   1.713 +
   1.714 +#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
   1.715 +            asm volatile (
   1.716 +                "vld4.u8    %h[vsrc], [%[src]]!  \t\n"
   1.717 +                "vld4.u8    %h[vdst], [%[dst]]   \t\n"
   1.718 +                : [vsrc] "=w" (vsrc), [vdst] "=w" (vdst), [src] "+&r" (src)
   1.719 +                : [dst] "r" (dst)
   1.720 +                :
   1.721 +            );
   1.722 +#else
   1.723 +            register uint8x8_t d0 asm("d0");
   1.724 +            register uint8x8_t d1 asm("d1");
   1.725 +            register uint8x8_t d2 asm("d2");
   1.726 +            register uint8x8_t d3 asm("d3");
   1.727 +            register uint8x8_t d4 asm("d4");
   1.728 +            register uint8x8_t d5 asm("d5");
   1.729 +            register uint8x8_t d6 asm("d6");
   1.730 +            register uint8x8_t d7 asm("d7");
   1.731 +
   1.732 +            asm volatile (
   1.733 +                "vld4.u8    {d0-d3},[%[src]]!;"
   1.734 +                "vld4.u8    {d4-d7},[%[dst]];"
   1.735 +                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
   1.736 +                  "=w" (d4), "=w" (d5), "=w" (d6), "=w" (d7),
   1.737 +                  [src] "+&r" (src)
   1.738 +                : [dst] "r" (dst)
   1.739 +                :
   1.740 +            );
   1.741 +            vsrc.val[0] = d0; vdst.val[0] = d4;
   1.742 +            vsrc.val[1] = d1; vdst.val[1] = d5;
   1.743 +            vsrc.val[2] = d2; vdst.val[2] = d6;
   1.744 +            vsrc.val[3] = d3; vdst.val[3] = d7;
   1.745 +#endif
   1.746 +
   1.747 +            vres = procSIMD(vsrc, vdst);
   1.748 +
   1.749 +            vst4_u8((uint8_t*)dst, vres);
   1.750 +
   1.751 +            count -= 8;
   1.752 +            dst += 8;
   1.753 +        }
   1.754 +        // Leftovers
   1.755 +        for (int i = 0; i < count; i++) {
   1.756 +            dst[i] = proc(src[i], dst[i]);
   1.757 +        }
   1.758 +    } else {
   1.759 +        for (int i = count - 1; i >= 0; --i) {
   1.760 +            unsigned a = aa[i];
   1.761 +            if (0 != a) {
   1.762 +                SkPMColor dstC = dst[i];
   1.763 +                SkPMColor C = proc(src[i], dstC);
   1.764 +                if (a != 0xFF) {
   1.765 +                    C = SkFourByteInterp_neon(C, dstC, a);
   1.766 +                }
   1.767 +                dst[i] = C;
   1.768 +            }
   1.769 +        }
   1.770 +    }
   1.771 +}
   1.772 +
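          +// xfer16 follows the same structure for RGB565 destinations: load 8 dst
          +// pixels, widen them to 8888 with SkPixel16ToPixel32_neon8, run the same
          +// procSIMD as the 32-bit path, then narrow the result back with
          +// SkPixel32ToPixel16_neon8.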
   1.773 +void SkNEONProcCoeffXfermode::xfer16(uint16_t* SK_RESTRICT dst,
   1.774 +                                     const SkPMColor* SK_RESTRICT src, int count,
   1.775 +                                     const SkAlpha* SK_RESTRICT aa) const {
   1.776 +    SkASSERT(dst && src && count >= 0);
   1.777 +
   1.778 +    SkXfermodeProc proc = this->getProc();
   1.779 +    SkXfermodeProcSIMD procSIMD = reinterpret_cast<SkXfermodeProcSIMD>(fProcSIMD);
   1.780 +    SkASSERT(procSIMD != NULL);
   1.781 +
   1.782 +    if (NULL == aa) {
    1.783 +        while (count >= 8) {
   1.784 +            uint16x8_t vdst, vres16;
   1.785 +            uint8x8x4_t vdst32, vsrc, vres;
   1.786 +
   1.787 +            vdst = vld1q_u16(dst);
   1.788 +
   1.789 +#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6))
   1.790 +            asm volatile (
   1.791 +                "vld4.u8    %h[vsrc], [%[src]]!  \t\n"
   1.792 +                : [vsrc] "=w" (vsrc), [src] "+&r" (src)
   1.793 +                : :
   1.794 +            );
   1.795 +#else
   1.796 +            register uint8x8_t d0 asm("d0");
   1.797 +            register uint8x8_t d1 asm("d1");
   1.798 +            register uint8x8_t d2 asm("d2");
   1.799 +            register uint8x8_t d3 asm("d3");
   1.800 +
   1.801 +            asm volatile (
   1.802 +                "vld4.u8    {d0-d3},[%[src]]!;"
   1.803 +                : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3),
   1.804 +                  [src] "+&r" (src)
   1.805 +                : :
   1.806 +            );
   1.807 +            vsrc.val[0] = d0;
   1.808 +            vsrc.val[1] = d1;
   1.809 +            vsrc.val[2] = d2;
   1.810 +            vsrc.val[3] = d3;
   1.811 +#endif
   1.812 +
   1.813 +            vdst32 = SkPixel16ToPixel32_neon8(vdst);
   1.814 +            vres = procSIMD(vsrc, vdst32);
   1.815 +            vres16 = SkPixel32ToPixel16_neon8(vres);
   1.816 +
   1.817 +            vst1q_u16(dst, vres16);
   1.818 +
   1.819 +            count -= 8;
   1.820 +            dst += 8;
   1.821 +        }
   1.822 +        for (int i = 0; i < count; i++) {
   1.823 +            SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
   1.824 +            dst[i] = SkPixel32ToPixel16_ToU16(proc(src[i], dstC));
   1.825 +        }
   1.826 +    } else {
   1.827 +        for (int i = count - 1; i >= 0; --i) {
   1.828 +            unsigned a = aa[i];
   1.829 +            if (0 != a) {
   1.830 +                SkPMColor dstC = SkPixel16ToPixel32(dst[i]);
   1.831 +                SkPMColor C = proc(src[i], dstC);
   1.832 +                if (0xFF != a) {
   1.833 +                    C = SkFourByteInterp_neon(C, dstC, a);
   1.834 +                }
   1.835 +                dst[i] = SkPixel32ToPixel16_ToU16(C);
   1.836 +            }
   1.837 +        }
   1.838 +    }
   1.839 +}
   1.840 +
   1.841 +#ifndef SK_IGNORE_TO_STRING
   1.842 +void SkNEONProcCoeffXfermode::toString(SkString* str) const {
   1.843 +    this->INHERITED::toString(str);
   1.844 +}
   1.845 +#endif
   1.846 +
   1.847 +////////////////////////////////////////////////////////////////////////////////
   1.848 +
   1.849 +SkXfermodeProcSIMD gNEONXfermodeProcs[] = {
   1.850 +    NULL, // kClear_Mode
   1.851 +    NULL, // kSrc_Mode
   1.852 +    NULL, // kDst_Mode
   1.853 +    NULL, // kSrcOver_Mode
   1.854 +    dstover_modeproc_neon8,
   1.855 +    srcin_modeproc_neon8,
   1.856 +    dstin_modeproc_neon8,
   1.857 +    srcout_modeproc_neon8,
   1.858 +    dstout_modeproc_neon8,
   1.859 +    srcatop_modeproc_neon8,
   1.860 +    dstatop_modeproc_neon8,
   1.861 +    xor_modeproc_neon8,
   1.862 +    plus_modeproc_neon8,
   1.863 +    modulate_modeproc_neon8,
   1.864 +    screen_modeproc_neon8,
   1.865 +
   1.866 +    overlay_modeproc_neon8,
   1.867 +    darken_modeproc_neon8,
   1.868 +    lighten_modeproc_neon8,
   1.869 +    NULL, // kColorDodge_Mode
   1.870 +    NULL, // kColorBurn_Mode
   1.871 +    hardlight_modeproc_neon8,
   1.872 +    NULL, // kSoftLight_Mode
   1.873 +    difference_modeproc_neon8,
   1.874 +    exclusion_modeproc_neon8,
   1.875 +    multiply_modeproc_neon8,
   1.876 +
   1.877 +    NULL, // kHue_Mode
   1.878 +    NULL, // kSaturation_Mode
   1.879 +    NULL, // kColor_Mode
   1.880 +    NULL, // kLuminosity_Mode
   1.881 +};
   1.882 +
   1.883 +SK_COMPILE_ASSERT(
   1.884 +    SK_ARRAY_COUNT(gNEONXfermodeProcs) == SkXfermode::kLastMode + 1,
   1.885 +    mode_count_arm
   1.886 +);
   1.887 +
   1.888 +SkXfermodeProc gNEONXfermodeProcs1[] = {
   1.889 +    NULL, // kClear_Mode
   1.890 +    NULL, // kSrc_Mode
   1.891 +    NULL, // kDst_Mode
   1.892 +    NULL, // kSrcOver_Mode
   1.893 +    NULL, // kDstOver_Mode
   1.894 +    NULL, // kSrcIn_Mode
   1.895 +    NULL, // kDstIn_Mode
   1.896 +    NULL, // kSrcOut_Mode
   1.897 +    NULL, // kDstOut_Mode
   1.898 +    srcatop_modeproc_neon,
   1.899 +    dstatop_modeproc_neon,
   1.900 +    xor_modeproc_neon,
   1.901 +    plus_modeproc_neon,
   1.902 +    modulate_modeproc_neon,
   1.903 +    NULL, // kScreen_Mode
   1.904 +
   1.905 +    NULL, // kOverlay_Mode
   1.906 +    NULL, // kDarken_Mode
   1.907 +    NULL, // kLighten_Mode
   1.908 +    NULL, // kColorDodge_Mode
   1.909 +    NULL, // kColorBurn_Mode
   1.910 +    NULL, // kHardLight_Mode
   1.911 +    NULL, // kSoftLight_Mode
   1.912 +    NULL, // kDifference_Mode
   1.913 +    NULL, // kExclusion_Mode
   1.914 +    NULL, // kMultiply_Mode
   1.915 +
   1.916 +    NULL, // kHue_Mode
   1.917 +    NULL, // kSaturation_Mode
   1.918 +    NULL, // kColor_Mode
   1.919 +    NULL, // kLuminosity_Mode
   1.920 +};
   1.921 +
   1.922 +SK_COMPILE_ASSERT(
   1.923 +    SK_ARRAY_COUNT(gNEONXfermodeProcs1) == SkXfermode::kLastMode + 1,
   1.924 +    mode1_count_arm
   1.925 +);
   1.926 +
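          +// Factory entry points: when a mode has no NEON 8-pixel proc (a NULL entry in
          +// gNEONXfermodeProcs), the factory returns NULL and the caller is expected to
          +// fall back to the generic SkProcCoeffXfermode implementation.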
   1.927 +SkProcCoeffXfermode* SkPlatformXfermodeFactory_impl_neon(const ProcCoeff& rec,
   1.928 +                                                         SkXfermode::Mode mode) {
   1.929 +
   1.930 +    void* procSIMD = reinterpret_cast<void*>(gNEONXfermodeProcs[mode]);
   1.931 +
   1.932 +    if (procSIMD != NULL) {
   1.933 +        return SkNEW_ARGS(SkNEONProcCoeffXfermode, (rec, mode, procSIMD));
   1.934 +    }
   1.935 +    return NULL;
   1.936 +}
   1.937 +
   1.938 +SkXfermodeProc SkPlatformXfermodeProcFactory_impl_neon(SkXfermode::Mode mode) {
   1.939 +    return gNEONXfermodeProcs1[mode];
   1.940 +}
