1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/gfx/skia/trunk/src/opts/SkBlitRow_opts_arm_neon.cpp Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1422 @@ 1.4 +/* 1.5 + * Copyright 2012 The Android Open Source Project 1.6 + * 1.7 + * Use of this source code is governed by a BSD-style license that can be 1.8 + * found in the LICENSE file. 1.9 + */ 1.10 + 1.11 +#include "SkBlitRow_opts_arm_neon.h" 1.12 + 1.13 +#include "SkBlitMask.h" 1.14 +#include "SkBlitRow.h" 1.15 +#include "SkColorPriv.h" 1.16 +#include "SkDither.h" 1.17 +#include "SkMathPriv.h" 1.18 +#include "SkUtils.h" 1.19 + 1.20 +#include "SkCachePreload_arm.h" 1.21 +#include "SkColor_opts_neon.h" 1.22 +#include <arm_neon.h> 1.23 + 1.24 +void S32_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 1.25 + const SkPMColor* SK_RESTRICT src, int count, 1.26 + U8CPU alpha, int /*x*/, int /*y*/) { 1.27 + SkASSERT(255 == alpha); 1.28 + 1.29 + while (count >= 8) { 1.30 + uint8x8x4_t vsrc; 1.31 + uint16x8_t vdst; 1.32 + 1.33 + // Load 1.34 + vsrc = vld4_u8((uint8_t*)src); 1.35 + 1.36 + // Convert src to 565 1.37 + vdst = SkPixel32ToPixel16_neon8(vsrc); 1.38 + 1.39 + // Store 1.40 + vst1q_u16(dst, vdst); 1.41 + 1.42 + // Prepare next iteration 1.43 + dst += 8; 1.44 + src += 8; 1.45 + count -= 8; 1.46 + }; 1.47 + 1.48 + // Leftovers 1.49 + while (count > 0) { 1.50 + SkPMColor c = *src++; 1.51 + SkPMColorAssert(c); 1.52 + *dst = SkPixel32ToPixel16_ToU16(c); 1.53 + dst++; 1.54 + count--; 1.55 + }; 1.56 +} 1.57 + 1.58 +void S32A_D565_Opaque_neon(uint16_t* SK_RESTRICT dst, 1.59 + const SkPMColor* SK_RESTRICT src, int count, 1.60 + U8CPU alpha, int /*x*/, int /*y*/) { 1.61 + SkASSERT(255 == alpha); 1.62 + 1.63 + if (count >= 8) { 1.64 + uint16_t* SK_RESTRICT keep_dst = 0; 1.65 + 1.66 + asm volatile ( 1.67 + "ands ip, %[count], #7 \n\t" 1.68 + "vmov.u8 d31, #1<<7 \n\t" 1.69 + "vld1.16 {q12}, [%[dst]] \n\t" 1.70 + "vld4.8 {d0-d3}, [%[src]] \n\t" 1.71 + // Thumb does not support the standard ARM conditional 1.72 + // instructions but instead requires the 'it' instruction 1.73 + // to signal conditional execution 1.74 + "it eq \n\t" 1.75 + "moveq ip, #8 \n\t" 1.76 + "mov %[keep_dst], %[dst] \n\t" 1.77 + 1.78 + "add %[src], %[src], ip, LSL#2 \n\t" 1.79 + "add %[dst], %[dst], ip, LSL#1 \n\t" 1.80 + "subs %[count], %[count], ip \n\t" 1.81 + "b 9f \n\t" 1.82 + // LOOP 1.83 + "2: \n\t" 1.84 + 1.85 + "vld1.16 {q12}, [%[dst]]! \n\t" 1.86 + "vld4.8 {d0-d3}, [%[src]]! 
\n\t" 1.87 + "vst1.16 {q10}, [%[keep_dst]] \n\t" 1.88 + "sub %[keep_dst], %[dst], #8*2 \n\t" 1.89 + "subs %[count], %[count], #8 \n\t" 1.90 + "9: \n\t" 1.91 + "pld [%[dst],#32] \n\t" 1.92 + // expand 0565 q12 to 8888 {d4-d7} 1.93 + "vmovn.u16 d4, q12 \n\t" 1.94 + "vshr.u16 q11, q12, #5 \n\t" 1.95 + "vshr.u16 q10, q12, #6+5 \n\t" 1.96 + "vmovn.u16 d5, q11 \n\t" 1.97 + "vmovn.u16 d6, q10 \n\t" 1.98 + "vshl.u8 d4, d4, #3 \n\t" 1.99 + "vshl.u8 d5, d5, #2 \n\t" 1.100 + "vshl.u8 d6, d6, #3 \n\t" 1.101 + 1.102 + "vmovl.u8 q14, d31 \n\t" 1.103 + "vmovl.u8 q13, d31 \n\t" 1.104 + "vmovl.u8 q12, d31 \n\t" 1.105 + 1.106 + // duplicate in 4/2/1 & 8pix vsns 1.107 + "vmvn.8 d30, d3 \n\t" 1.108 + "vmlal.u8 q14, d30, d6 \n\t" 1.109 + "vmlal.u8 q13, d30, d5 \n\t" 1.110 + "vmlal.u8 q12, d30, d4 \n\t" 1.111 + "vshr.u16 q8, q14, #5 \n\t" 1.112 + "vshr.u16 q9, q13, #6 \n\t" 1.113 + "vaddhn.u16 d6, q14, q8 \n\t" 1.114 + "vshr.u16 q8, q12, #5 \n\t" 1.115 + "vaddhn.u16 d5, q13, q9 \n\t" 1.116 + "vqadd.u8 d6, d6, d0 \n\t" // moved up 1.117 + "vaddhn.u16 d4, q12, q8 \n\t" 1.118 + // intentionally don't calculate alpha 1.119 + // result in d4-d6 1.120 + 1.121 + "vqadd.u8 d5, d5, d1 \n\t" 1.122 + "vqadd.u8 d4, d4, d2 \n\t" 1.123 + 1.124 + // pack 8888 {d4-d6} to 0565 q10 1.125 + "vshll.u8 q10, d6, #8 \n\t" 1.126 + "vshll.u8 q3, d5, #8 \n\t" 1.127 + "vshll.u8 q2, d4, #8 \n\t" 1.128 + "vsri.u16 q10, q3, #5 \n\t" 1.129 + "vsri.u16 q10, q2, #11 \n\t" 1.130 + 1.131 + "bne 2b \n\t" 1.132 + 1.133 + "1: \n\t" 1.134 + "vst1.16 {q10}, [%[keep_dst]] \n\t" 1.135 + : [count] "+r" (count) 1.136 + : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 1.137 + : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 1.138 + "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 1.139 + "d30","d31" 1.140 + ); 1.141 + } 1.142 + else 1.143 + { // handle count < 8 1.144 + uint16_t* SK_RESTRICT keep_dst = 0; 1.145 + 1.146 + asm volatile ( 1.147 + "vmov.u8 d31, #1<<7 \n\t" 1.148 + "mov %[keep_dst], %[dst] \n\t" 1.149 + 1.150 + "tst %[count], #4 \n\t" 1.151 + "beq 14f \n\t" 1.152 + "vld1.16 {d25}, [%[dst]]! \n\t" 1.153 + "vld1.32 {q1}, [%[src]]! \n\t" 1.154 + 1.155 + "14: \n\t" 1.156 + "tst %[count], #2 \n\t" 1.157 + "beq 12f \n\t" 1.158 + "vld1.32 {d24[1]}, [%[dst]]! \n\t" 1.159 + "vld1.32 {d1}, [%[src]]! \n\t" 1.160 + 1.161 + "12: \n\t" 1.162 + "tst %[count], #1 \n\t" 1.163 + "beq 11f \n\t" 1.164 + "vld1.16 {d24[1]}, [%[dst]]! \n\t" 1.165 + "vld1.32 {d0[1]}, [%[src]]! 
\n\t" 1.166 + 1.167 + "11: \n\t" 1.168 + // unzips achieve the same as a vld4 operation 1.169 + "vuzpq.u16 q0, q1 \n\t" 1.170 + "vuzp.u8 d0, d1 \n\t" 1.171 + "vuzp.u8 d2, d3 \n\t" 1.172 + // expand 0565 q12 to 8888 {d4-d7} 1.173 + "vmovn.u16 d4, q12 \n\t" 1.174 + "vshr.u16 q11, q12, #5 \n\t" 1.175 + "vshr.u16 q10, q12, #6+5 \n\t" 1.176 + "vmovn.u16 d5, q11 \n\t" 1.177 + "vmovn.u16 d6, q10 \n\t" 1.178 + "vshl.u8 d4, d4, #3 \n\t" 1.179 + "vshl.u8 d5, d5, #2 \n\t" 1.180 + "vshl.u8 d6, d6, #3 \n\t" 1.181 + 1.182 + "vmovl.u8 q14, d31 \n\t" 1.183 + "vmovl.u8 q13, d31 \n\t" 1.184 + "vmovl.u8 q12, d31 \n\t" 1.185 + 1.186 + // duplicate in 4/2/1 & 8pix vsns 1.187 + "vmvn.8 d30, d3 \n\t" 1.188 + "vmlal.u8 q14, d30, d6 \n\t" 1.189 + "vmlal.u8 q13, d30, d5 \n\t" 1.190 + "vmlal.u8 q12, d30, d4 \n\t" 1.191 + "vshr.u16 q8, q14, #5 \n\t" 1.192 + "vshr.u16 q9, q13, #6 \n\t" 1.193 + "vaddhn.u16 d6, q14, q8 \n\t" 1.194 + "vshr.u16 q8, q12, #5 \n\t" 1.195 + "vaddhn.u16 d5, q13, q9 \n\t" 1.196 + "vqadd.u8 d6, d6, d0 \n\t" // moved up 1.197 + "vaddhn.u16 d4, q12, q8 \n\t" 1.198 + // intentionally don't calculate alpha 1.199 + // result in d4-d6 1.200 + 1.201 + "vqadd.u8 d5, d5, d1 \n\t" 1.202 + "vqadd.u8 d4, d4, d2 \n\t" 1.203 + 1.204 + // pack 8888 {d4-d6} to 0565 q10 1.205 + "vshll.u8 q10, d6, #8 \n\t" 1.206 + "vshll.u8 q3, d5, #8 \n\t" 1.207 + "vshll.u8 q2, d4, #8 \n\t" 1.208 + "vsri.u16 q10, q3, #5 \n\t" 1.209 + "vsri.u16 q10, q2, #11 \n\t" 1.210 + 1.211 + // store 1.212 + "tst %[count], #4 \n\t" 1.213 + "beq 24f \n\t" 1.214 + "vst1.16 {d21}, [%[keep_dst]]! \n\t" 1.215 + 1.216 + "24: \n\t" 1.217 + "tst %[count], #2 \n\t" 1.218 + "beq 22f \n\t" 1.219 + "vst1.32 {d20[1]}, [%[keep_dst]]! \n\t" 1.220 + 1.221 + "22: \n\t" 1.222 + "tst %[count], #1 \n\t" 1.223 + "beq 21f \n\t" 1.224 + "vst1.16 {d20[1]}, [%[keep_dst]]! \n\t" 1.225 + 1.226 + "21: \n\t" 1.227 + : [count] "+r" (count) 1.228 + : [dst] "r" (dst), [keep_dst] "r" (keep_dst), [src] "r" (src) 1.229 + : "ip", "cc", "memory", "d0","d1","d2","d3","d4","d5","d6","d7", 1.230 + "d16","d17","d18","d19","d20","d21","d22","d23","d24","d25","d26","d27","d28","d29", 1.231 + "d30","d31" 1.232 + ); 1.233 + } 1.234 +} 1.235 + 1.236 +static inline uint16x8_t SkDiv255Round_neon8(uint16x8_t prod) { 1.237 + prod += vdupq_n_u16(128); 1.238 + prod += vshrq_n_u16(prod, 8); 1.239 + return vshrq_n_u16(prod, 8); 1.240 +} 1.241 + 1.242 +void S32A_D565_Blend_neon(uint16_t* SK_RESTRICT dst, 1.243 + const SkPMColor* SK_RESTRICT src, int count, 1.244 + U8CPU alpha, int /*x*/, int /*y*/) { 1.245 + SkASSERT(255 > alpha); 1.246 + 1.247 + /* This code implements a Neon version of S32A_D565_Blend. The results have 1.248 + * a few mismatches compared to the original code. These mismatches never 1.249 + * exceed 1. 1.250 + */ 1.251 + 1.252 + if (count >= 8) { 1.253 + uint16x8_t valpha_max, vmask_blue; 1.254 + uint8x8_t valpha; 1.255 + 1.256 + // prepare constants 1.257 + valpha_max = vmovq_n_u16(255); 1.258 + valpha = vdup_n_u8(alpha); 1.259 + vmask_blue = vmovq_n_u16(SK_B16_MASK); 1.260 + 1.261 + do { 1.262 + uint16x8_t vdst, vdst_r, vdst_g, vdst_b; 1.263 + uint16x8_t vres_a, vres_r, vres_g, vres_b; 1.264 + uint8x8x4_t vsrc; 1.265 + 1.266 + // load pixels 1.267 + vdst = vld1q_u16(dst); 1.268 +#if (__GNUC__ > 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ > 6)) 1.269 + asm ( 1.270 + "vld4.u8 %h[vsrc], [%[src]]!" 
1.271 + : [vsrc] "=w" (vsrc), [src] "+&r" (src) 1.272 + : : 1.273 + ); 1.274 +#else 1.275 + register uint8x8_t d0 asm("d0"); 1.276 + register uint8x8_t d1 asm("d1"); 1.277 + register uint8x8_t d2 asm("d2"); 1.278 + register uint8x8_t d3 asm("d3"); 1.279 + 1.280 + asm volatile ( 1.281 + "vld4.u8 {d0-d3},[%[src]]!;" 1.282 + : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), 1.283 + [src] "+&r" (src) 1.284 + : : 1.285 + ); 1.286 + vsrc.val[0] = d0; 1.287 + vsrc.val[1] = d1; 1.288 + vsrc.val[2] = d2; 1.289 + vsrc.val[3] = d3; 1.290 +#endif 1.291 + 1.292 + 1.293 + // deinterleave dst 1.294 + vdst_g = vshlq_n_u16(vdst, SK_R16_BITS); // shift green to top of lanes 1.295 + vdst_b = vdst & vmask_blue; // extract blue 1.296 + vdst_r = vshrq_n_u16(vdst, SK_R16_SHIFT); // extract red 1.297 + vdst_g = vshrq_n_u16(vdst_g, SK_R16_BITS + SK_B16_BITS); // extract green 1.298 + 1.299 + // shift src to 565 1.300 + vsrc.val[NEON_R] = vshr_n_u8(vsrc.val[NEON_R], 8 - SK_R16_BITS); 1.301 + vsrc.val[NEON_G] = vshr_n_u8(vsrc.val[NEON_G], 8 - SK_G16_BITS); 1.302 + vsrc.val[NEON_B] = vshr_n_u8(vsrc.val[NEON_B], 8 - SK_B16_BITS); 1.303 + 1.304 + // calc src * src_scale 1.305 + vres_a = vmull_u8(vsrc.val[NEON_A], valpha); 1.306 + vres_r = vmull_u8(vsrc.val[NEON_R], valpha); 1.307 + vres_g = vmull_u8(vsrc.val[NEON_G], valpha); 1.308 + vres_b = vmull_u8(vsrc.val[NEON_B], valpha); 1.309 + 1.310 + // prepare dst_scale 1.311 + vres_a = SkDiv255Round_neon8(vres_a); 1.312 + vres_a = valpha_max - vres_a; // 255 - (sa * src_scale) / 255 1.313 + 1.314 + // add dst * dst_scale to previous result 1.315 + vres_r = vmlaq_u16(vres_r, vdst_r, vres_a); 1.316 + vres_g = vmlaq_u16(vres_g, vdst_g, vres_a); 1.317 + vres_b = vmlaq_u16(vres_b, vdst_b, vres_a); 1.318 + 1.319 +#ifdef S32A_D565_BLEND_EXACT 1.320 + // It is possible to get exact results with this but it is slow, 1.321 + // even slower than C code in some cases 1.322 + vres_r = SkDiv255Round_neon8(vres_r); 1.323 + vres_g = SkDiv255Round_neon8(vres_g); 1.324 + vres_b = SkDiv255Round_neon8(vres_b); 1.325 +#else 1.326 + vres_r = vrshrq_n_u16(vres_r, 8); 1.327 + vres_g = vrshrq_n_u16(vres_g, 8); 1.328 + vres_b = vrshrq_n_u16(vres_b, 8); 1.329 +#endif 1.330 + // pack result 1.331 + vres_b = vsliq_n_u16(vres_b, vres_g, SK_G16_SHIFT); // insert green into blue 1.332 + vres_b = vsliq_n_u16(vres_b, vres_r, SK_R16_SHIFT); // insert red into green/blue 1.333 + 1.334 + // store 1.335 + vst1q_u16(dst, vres_b); 1.336 + dst += 8; 1.337 + count -= 8; 1.338 + } while (count >= 8); 1.339 + } 1.340 + 1.341 + // leftovers 1.342 + while (count-- > 0) { 1.343 + SkPMColor sc = *src++; 1.344 + if (sc) { 1.345 + uint16_t dc = *dst; 1.346 + unsigned dst_scale = 255 - SkMulDiv255Round(SkGetPackedA32(sc), alpha); 1.347 + unsigned dr = SkMulS16(SkPacked32ToR16(sc), alpha) + SkMulS16(SkGetPackedR16(dc), dst_scale); 1.348 + unsigned dg = SkMulS16(SkPacked32ToG16(sc), alpha) + SkMulS16(SkGetPackedG16(dc), dst_scale); 1.349 + unsigned db = SkMulS16(SkPacked32ToB16(sc), alpha) + SkMulS16(SkGetPackedB16(dc), dst_scale); 1.350 + *dst = SkPackRGB16(SkDiv255Round(dr), SkDiv255Round(dg), SkDiv255Round(db)); 1.351 + } 1.352 + dst += 1; 1.353 + } 1.354 +} 1.355 + 1.356 +/* dither matrix for Neon, derived from gDitherMatrix_3Bit_16. 1.357 + * each dither value is spaced out into byte lanes, and repeated 1.358 + * to allow an 8-byte load from offsets 0, 1, 2 or 3 from the 1.359 + * start of each row. 
1.360 + */ 1.361 +static const uint8_t gDitherMatrix_Neon[48] = { 1.362 + 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 1.363 + 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 1.364 + 1, 5, 0, 4, 1, 5, 0, 4, 1, 5, 0, 4, 1.365 + 7, 3, 6, 2, 7, 3, 6, 2, 7, 3, 6, 2, 1.366 + 1.367 +}; 1.368 + 1.369 +void S32_D565_Blend_Dither_neon(uint16_t *dst, const SkPMColor *src, 1.370 + int count, U8CPU alpha, int x, int y) 1.371 +{ 1.372 + 1.373 + SkASSERT(255 > alpha); 1.374 + 1.375 + // rescale alpha to range 1 - 256 1.376 + int scale = SkAlpha255To256(alpha); 1.377 + 1.378 + if (count >= 8) { 1.379 + /* select row and offset for dither array */ 1.380 + const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 1.381 + 1.382 + uint8x8_t vdither = vld1_u8(dstart); // load dither values 1.383 + uint8x8_t vdither_g = vshr_n_u8(vdither, 1); // calc. green dither values 1.384 + 1.385 + int16x8_t vscale = vdupq_n_s16(scale); // duplicate scale into neon reg 1.386 + uint16x8_t vmask_b = vdupq_n_u16(0x1F); // set up blue mask 1.387 + 1.388 + do { 1.389 + 1.390 + uint8x8_t vsrc_r, vsrc_g, vsrc_b; 1.391 + uint8x8_t vsrc565_r, vsrc565_g, vsrc565_b; 1.392 + uint16x8_t vsrc_dit_r, vsrc_dit_g, vsrc_dit_b; 1.393 + uint16x8_t vsrc_res_r, vsrc_res_g, vsrc_res_b; 1.394 + uint16x8_t vdst; 1.395 + uint16x8_t vdst_r, vdst_g, vdst_b; 1.396 + int16x8_t vres_r, vres_g, vres_b; 1.397 + int8x8_t vres8_r, vres8_g, vres8_b; 1.398 + 1.399 + // Load source and add dither 1.400 + { 1.401 + register uint8x8_t d0 asm("d0"); 1.402 + register uint8x8_t d1 asm("d1"); 1.403 + register uint8x8_t d2 asm("d2"); 1.404 + register uint8x8_t d3 asm("d3"); 1.405 + 1.406 + asm ( 1.407 + "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" 1.408 + : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) 1.409 + : 1.410 + ); 1.411 + vsrc_g = d1; 1.412 +#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) 1.413 + vsrc_r = d2; vsrc_b = d0; 1.414 +#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) 1.415 + vsrc_r = d0; vsrc_b = d2; 1.416 +#endif 1.417 + } 1.418 + 1.419 + vsrc565_g = vshr_n_u8(vsrc_g, 6); // calc. green >> 6 1.420 + vsrc565_r = vshr_n_u8(vsrc_r, 5); // calc. red >> 5 1.421 + vsrc565_b = vshr_n_u8(vsrc_b, 5); // calc. 
blue >> 5 1.422 + 1.423 + vsrc_dit_g = vaddl_u8(vsrc_g, vdither_g); // add in dither to green and widen 1.424 + vsrc_dit_r = vaddl_u8(vsrc_r, vdither); // add in dither to red and widen 1.425 + vsrc_dit_b = vaddl_u8(vsrc_b, vdither); // add in dither to blue and widen 1.426 + 1.427 + vsrc_dit_r = vsubw_u8(vsrc_dit_r, vsrc565_r); // sub shifted red from result 1.428 + vsrc_dit_g = vsubw_u8(vsrc_dit_g, vsrc565_g); // sub shifted green from result 1.429 + vsrc_dit_b = vsubw_u8(vsrc_dit_b, vsrc565_b); // sub shifted blue from result 1.430 + 1.431 + vsrc_res_r = vshrq_n_u16(vsrc_dit_r, 3); 1.432 + vsrc_res_g = vshrq_n_u16(vsrc_dit_g, 2); 1.433 + vsrc_res_b = vshrq_n_u16(vsrc_dit_b, 3); 1.434 + 1.435 + // Load dst and unpack 1.436 + vdst = vld1q_u16(dst); 1.437 + vdst_g = vshrq_n_u16(vdst, 5); // shift down to get green 1.438 + vdst_r = vshrq_n_u16(vshlq_n_u16(vdst, 5), 5+5); // double shift to extract red 1.439 + vdst_b = vandq_u16(vdst, vmask_b); // mask to get blue 1.440 + 1.441 + // subtract dst from src and widen 1.442 + vres_r = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_r), vreinterpretq_s16_u16(vdst_r)); 1.443 + vres_g = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_g), vreinterpretq_s16_u16(vdst_g)); 1.444 + vres_b = vsubq_s16(vreinterpretq_s16_u16(vsrc_res_b), vreinterpretq_s16_u16(vdst_b)); 1.445 + 1.446 + // multiply diffs by scale and shift 1.447 + vres_r = vmulq_s16(vres_r, vscale); 1.448 + vres_g = vmulq_s16(vres_g, vscale); 1.449 + vres_b = vmulq_s16(vres_b, vscale); 1.450 + 1.451 + vres8_r = vshrn_n_s16(vres_r, 8); 1.452 + vres8_g = vshrn_n_s16(vres_g, 8); 1.453 + vres8_b = vshrn_n_s16(vres_b, 8); 1.454 + 1.455 + // add dst to result 1.456 + vres_r = vaddw_s8(vreinterpretq_s16_u16(vdst_r), vres8_r); 1.457 + vres_g = vaddw_s8(vreinterpretq_s16_u16(vdst_g), vres8_g); 1.458 + vres_b = vaddw_s8(vreinterpretq_s16_u16(vdst_b), vres8_b); 1.459 + 1.460 + // put result into 565 format 1.461 + vres_b = vsliq_n_s16(vres_b, vres_g, 5); // shift up green and insert into blue 1.462 + vres_b = vsliq_n_s16(vres_b, vres_r, 6+5); // shift up red and insert into blue 1.463 + 1.464 + // Store result 1.465 + vst1q_u16(dst, vreinterpretq_u16_s16(vres_b)); 1.466 + 1.467 + // Next iteration 1.468 + dst += 8; 1.469 + count -= 8; 1.470 + 1.471 + } while (count >= 8); 1.472 + } 1.473 + 1.474 + // Leftovers 1.475 + if (count > 0) { 1.476 + int scale = SkAlpha255To256(alpha); 1.477 + DITHER_565_SCAN(y); 1.478 + do { 1.479 + SkPMColor c = *src++; 1.480 + SkPMColorAssert(c); 1.481 + 1.482 + int dither = DITHER_VALUE(x); 1.483 + int sr = SkGetPackedR32(c); 1.484 + int sg = SkGetPackedG32(c); 1.485 + int sb = SkGetPackedB32(c); 1.486 + sr = SkDITHER_R32To565(sr, dither); 1.487 + sg = SkDITHER_G32To565(sg, dither); 1.488 + sb = SkDITHER_B32To565(sb, dither); 1.489 + 1.490 + uint16_t d = *dst; 1.491 + *dst++ = SkPackRGB16(SkAlphaBlend(sr, SkGetPackedR16(d), scale), 1.492 + SkAlphaBlend(sg, SkGetPackedG16(d), scale), 1.493 + SkAlphaBlend(sb, SkGetPackedB16(d), scale)); 1.494 + DITHER_INC_X(x); 1.495 + } while (--count != 0); 1.496 + } 1.497 +} 1.498 + 1.499 +void S32A_Opaque_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 1.500 + const SkPMColor* SK_RESTRICT src, 1.501 + int count, U8CPU alpha) { 1.502 + 1.503 + SkASSERT(255 == alpha); 1.504 + if (count > 0) { 1.505 + 1.506 + 1.507 + uint8x8_t alpha_mask; 1.508 + 1.509 + static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 1.510 + alpha_mask = vld1_u8(alpha_mask_setup); 1.511 + 1.512 + /* do the NEON unrolled code */ 1.513 +#define UNROLL 4 1.514 + while (count >= 
UNROLL) { 1.515 + uint8x8_t src_raw, dst_raw, dst_final; 1.516 + uint8x8_t src_raw_2, dst_raw_2, dst_final_2; 1.517 + 1.518 + /* The two prefetches below may make the code slighlty 1.519 + * slower for small values of count but are worth having 1.520 + * in the general case. 1.521 + */ 1.522 + __builtin_prefetch(src+32); 1.523 + __builtin_prefetch(dst+32); 1.524 + 1.525 + /* get the source */ 1.526 + src_raw = vreinterpret_u8_u32(vld1_u32(src)); 1.527 +#if UNROLL > 2 1.528 + src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); 1.529 +#endif 1.530 + 1.531 + /* get and hold the dst too */ 1.532 + dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 1.533 +#if UNROLL > 2 1.534 + dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); 1.535 +#endif 1.536 + 1.537 + /* 1st and 2nd bits of the unrolling */ 1.538 + { 1.539 + uint8x8_t dst_cooked; 1.540 + uint16x8_t dst_wide; 1.541 + uint8x8_t alpha_narrow; 1.542 + uint16x8_t alpha_wide; 1.543 + 1.544 + /* get the alphas spread out properly */ 1.545 + alpha_narrow = vtbl1_u8(src_raw, alpha_mask); 1.546 + alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 1.547 + 1.548 + /* spread the dest */ 1.549 + dst_wide = vmovl_u8(dst_raw); 1.550 + 1.551 + /* alpha mul the dest */ 1.552 + dst_wide = vmulq_u16 (dst_wide, alpha_wide); 1.553 + dst_cooked = vshrn_n_u16(dst_wide, 8); 1.554 + 1.555 + /* sum -- ignoring any byte lane overflows */ 1.556 + dst_final = vadd_u8(src_raw, dst_cooked); 1.557 + } 1.558 + 1.559 +#if UNROLL > 2 1.560 + /* the 3rd and 4th bits of our unrolling */ 1.561 + { 1.562 + uint8x8_t dst_cooked; 1.563 + uint16x8_t dst_wide; 1.564 + uint8x8_t alpha_narrow; 1.565 + uint16x8_t alpha_wide; 1.566 + 1.567 + alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); 1.568 + alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 1.569 + 1.570 + /* spread the dest */ 1.571 + dst_wide = vmovl_u8(dst_raw_2); 1.572 + 1.573 + /* alpha mul the dest */ 1.574 + dst_wide = vmulq_u16 (dst_wide, alpha_wide); 1.575 + dst_cooked = vshrn_n_u16(dst_wide, 8); 1.576 + 1.577 + /* sum -- ignoring any byte lane overflows */ 1.578 + dst_final_2 = vadd_u8(src_raw_2, dst_cooked); 1.579 + } 1.580 +#endif 1.581 + 1.582 + vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 1.583 +#if UNROLL > 2 1.584 + vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); 1.585 +#endif 1.586 + 1.587 + src += UNROLL; 1.588 + dst += UNROLL; 1.589 + count -= UNROLL; 1.590 + } 1.591 +#undef UNROLL 1.592 + 1.593 + /* do any residual iterations */ 1.594 + while (--count >= 0) { 1.595 + *dst = SkPMSrcOver(*src, *dst); 1.596 + src += 1; 1.597 + dst += 1; 1.598 + } 1.599 + } 1.600 +} 1.601 + 1.602 +void S32A_Opaque_BlitRow32_neon_src_alpha(SkPMColor* SK_RESTRICT dst, 1.603 + const SkPMColor* SK_RESTRICT src, 1.604 + int count, U8CPU alpha) { 1.605 + SkASSERT(255 == alpha); 1.606 + 1.607 + if (count <= 0) 1.608 + return; 1.609 + 1.610 + /* Use these to check if src is transparent or opaque */ 1.611 + const unsigned int ALPHA_OPAQ = 0xFF000000; 1.612 + const unsigned int ALPHA_TRANS = 0x00FFFFFF; 1.613 + 1.614 +#define UNROLL 4 1.615 + const SkPMColor* SK_RESTRICT src_end = src + count - (UNROLL + 1); 1.616 + const SkPMColor* SK_RESTRICT src_temp = src; 1.617 + 1.618 + /* set up the NEON variables */ 1.619 + uint8x8_t alpha_mask; 1.620 + static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 1.621 + alpha_mask = vld1_u8(alpha_mask_setup); 1.622 + 1.623 + uint8x8_t src_raw, dst_raw, dst_final; 1.624 + uint8x8_t src_raw_2, dst_raw_2, dst_final_2; 1.625 + uint8x8_t dst_cooked; 1.626 + uint16x8_t dst_wide; 1.627 + uint8x8_t 
alpha_narrow; 1.628 + uint16x8_t alpha_wide; 1.629 + 1.630 + /* choose the first processing type */ 1.631 + if( src >= src_end) 1.632 + goto TAIL; 1.633 + if(*src <= ALPHA_TRANS) 1.634 + goto ALPHA_0; 1.635 + if(*src >= ALPHA_OPAQ) 1.636 + goto ALPHA_255; 1.637 + /* fall-thru */ 1.638 + 1.639 +ALPHA_1_TO_254: 1.640 + do { 1.641 + 1.642 + /* get the source */ 1.643 + src_raw = vreinterpret_u8_u32(vld1_u32(src)); 1.644 + src_raw_2 = vreinterpret_u8_u32(vld1_u32(src+2)); 1.645 + 1.646 + /* get and hold the dst too */ 1.647 + dst_raw = vreinterpret_u8_u32(vld1_u32(dst)); 1.648 + dst_raw_2 = vreinterpret_u8_u32(vld1_u32(dst+2)); 1.649 + 1.650 + 1.651 + /* get the alphas spread out properly */ 1.652 + alpha_narrow = vtbl1_u8(src_raw, alpha_mask); 1.653 + /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ 1.654 + /* we collapsed (255-a)+1 ... */ 1.655 + alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 1.656 + 1.657 + /* spread the dest */ 1.658 + dst_wide = vmovl_u8(dst_raw); 1.659 + 1.660 + /* alpha mul the dest */ 1.661 + dst_wide = vmulq_u16 (dst_wide, alpha_wide); 1.662 + dst_cooked = vshrn_n_u16(dst_wide, 8); 1.663 + 1.664 + /* sum -- ignoring any byte lane overflows */ 1.665 + dst_final = vadd_u8(src_raw, dst_cooked); 1.666 + 1.667 + alpha_narrow = vtbl1_u8(src_raw_2, alpha_mask); 1.668 + /* reflect SkAlpha255To256() semantics a+1 vs a+a>>7 */ 1.669 + /* we collapsed (255-a)+1 ... */ 1.670 + alpha_wide = vsubw_u8(vdupq_n_u16(256), alpha_narrow); 1.671 + 1.672 + /* spread the dest */ 1.673 + dst_wide = vmovl_u8(dst_raw_2); 1.674 + 1.675 + /* alpha mul the dest */ 1.676 + dst_wide = vmulq_u16 (dst_wide, alpha_wide); 1.677 + dst_cooked = vshrn_n_u16(dst_wide, 8); 1.678 + 1.679 + /* sum -- ignoring any byte lane overflows */ 1.680 + dst_final_2 = vadd_u8(src_raw_2, dst_cooked); 1.681 + 1.682 + vst1_u32(dst, vreinterpret_u32_u8(dst_final)); 1.683 + vst1_u32(dst+2, vreinterpret_u32_u8(dst_final_2)); 1.684 + 1.685 + src += UNROLL; 1.686 + dst += UNROLL; 1.687 + 1.688 + /* if 2 of the next pixels aren't between 1 and 254 1.689 + it might make sense to go to the optimized loops */ 1.690 + if((src[0] <= ALPHA_TRANS && src[1] <= ALPHA_TRANS) || (src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ)) 1.691 + break; 1.692 + 1.693 + } while(src < src_end); 1.694 + 1.695 + if (src >= src_end) 1.696 + goto TAIL; 1.697 + 1.698 + if(src[0] >= ALPHA_OPAQ && src[1] >= ALPHA_OPAQ) 1.699 + goto ALPHA_255; 1.700 + 1.701 + /*fall-thru*/ 1.702 + 1.703 +ALPHA_0: 1.704 + 1.705 + /*In this state, we know the current alpha is 0 and 1.706 + we optimize for the next alpha also being zero. */ 1.707 + src_temp = src; //so we don't have to increment dst every time 1.708 + do { 1.709 + if(*(++src) > ALPHA_TRANS) 1.710 + break; 1.711 + if(*(++src) > ALPHA_TRANS) 1.712 + break; 1.713 + if(*(++src) > ALPHA_TRANS) 1.714 + break; 1.715 + if(*(++src) > ALPHA_TRANS) 1.716 + break; 1.717 + } while(src < src_end); 1.718 + 1.719 + dst += (src - src_temp); 1.720 + 1.721 + /* no longer alpha 0, so determine where to go next. */ 1.722 + if( src >= src_end) 1.723 + goto TAIL; 1.724 + if(*src >= ALPHA_OPAQ) 1.725 + goto ALPHA_255; 1.726 + else 1.727 + goto ALPHA_1_TO_254; 1.728 + 1.729 +ALPHA_255: 1.730 + while((src[0] & src[1] & src[2] & src[3]) >= ALPHA_OPAQ) { 1.731 + dst[0]=src[0]; 1.732 + dst[1]=src[1]; 1.733 + dst[2]=src[2]; 1.734 + dst[3]=src[3]; 1.735 + src+=UNROLL; 1.736 + dst+=UNROLL; 1.737 + if(src >= src_end) 1.738 + goto TAIL; 1.739 + } 1.740 + 1.741 + //Handle remainder. 
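    // (The nested ifs below copy at most three more leading opaque pixels,
    //  one at a time, stopping at the first pixel that is not fully opaque;
    //  at most three, because the group-of-four test above has already failed.
    //  A roughly equivalent, if less branch-friendly, form would be
    //      while (*src >= ALPHA_OPAQ) { *dst++ = *src++; }
    //  The hand-unrolled version below mirrors the UNROLL factor of 4.)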
1.742 + if(*src >= ALPHA_OPAQ) { *dst++ = *src++; 1.743 + if(*src >= ALPHA_OPAQ) { *dst++ = *src++; 1.744 + if(*src >= ALPHA_OPAQ) { *dst++ = *src++; } 1.745 + } 1.746 + } 1.747 + 1.748 + if( src >= src_end) 1.749 + goto TAIL; 1.750 + if(*src <= ALPHA_TRANS) 1.751 + goto ALPHA_0; 1.752 + else 1.753 + goto ALPHA_1_TO_254; 1.754 + 1.755 +TAIL: 1.756 + /* do any residual iterations */ 1.757 + src_end += UNROLL + 1; //goto the real end 1.758 + while(src != src_end) { 1.759 + if( *src != 0 ) { 1.760 + if( *src >= ALPHA_OPAQ ) { 1.761 + *dst = *src; 1.762 + } 1.763 + else { 1.764 + *dst = SkPMSrcOver(*src, *dst); 1.765 + } 1.766 + } 1.767 + src++; 1.768 + dst++; 1.769 + } 1.770 + 1.771 +#undef UNROLL 1.772 + return; 1.773 +} 1.774 + 1.775 +/* Neon version of S32_Blend_BlitRow32() 1.776 + * portable version is in src/core/SkBlitRow_D32.cpp 1.777 + */ 1.778 +void S32_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 1.779 + const SkPMColor* SK_RESTRICT src, 1.780 + int count, U8CPU alpha) { 1.781 + SkASSERT(alpha <= 255); 1.782 + 1.783 + if (count <= 0) { 1.784 + return; 1.785 + } 1.786 + 1.787 + uint16_t src_scale = SkAlpha255To256(alpha); 1.788 + uint16_t dst_scale = 256 - src_scale; 1.789 + 1.790 + while (count >= 2) { 1.791 + uint8x8_t vsrc, vdst, vres; 1.792 + uint16x8_t vsrc_wide, vdst_wide; 1.793 + 1.794 + /* These commented prefetches are a big win for count 1.795 + * values > 64 on an A9 (Pandaboard) but hurt by 10% for count = 4. 1.796 + * They also hurt a little (<5%) on an A15 1.797 + */ 1.798 + //__builtin_prefetch(src+32); 1.799 + //__builtin_prefetch(dst+32); 1.800 + 1.801 + // Load 1.802 + vsrc = vreinterpret_u8_u32(vld1_u32(src)); 1.803 + vdst = vreinterpret_u8_u32(vld1_u32(dst)); 1.804 + 1.805 + // Process src 1.806 + vsrc_wide = vmovl_u8(vsrc); 1.807 + vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); 1.808 + 1.809 + // Process dst 1.810 + vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); 1.811 + 1.812 + // Combine 1.813 + vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 1.814 + 1.815 + // Store 1.816 + vst1_u32(dst, vreinterpret_u32_u8(vres)); 1.817 + 1.818 + src += 2; 1.819 + dst += 2; 1.820 + count -= 2; 1.821 + } 1.822 + 1.823 + if (count == 1) { 1.824 + uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; 1.825 + uint16x8_t vsrc_wide, vdst_wide; 1.826 + 1.827 + // Load 1.828 + vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); 1.829 + vdst = vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); 1.830 + 1.831 + // Process 1.832 + vsrc_wide = vmovl_u8(vsrc); 1.833 + vsrc_wide = vmulq_u16(vsrc_wide, vdupq_n_u16(src_scale)); 1.834 + vdst_wide = vmull_u8(vdst, vdup_n_u8(dst_scale)); 1.835 + vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 1.836 + 1.837 + // Store 1.838 + vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); 1.839 + } 1.840 +} 1.841 + 1.842 +void S32A_Blend_BlitRow32_neon(SkPMColor* SK_RESTRICT dst, 1.843 + const SkPMColor* SK_RESTRICT src, 1.844 + int count, U8CPU alpha) { 1.845 + 1.846 + SkASSERT(255 >= alpha); 1.847 + 1.848 + if (count <= 0) { 1.849 + return; 1.850 + } 1.851 + 1.852 + unsigned alpha256 = SkAlpha255To256(alpha); 1.853 + 1.854 + // First deal with odd counts 1.855 + if (count & 1) { 1.856 + uint8x8_t vsrc = vdup_n_u8(0), vdst = vdup_n_u8(0), vres; 1.857 + uint16x8_t vdst_wide, vsrc_wide; 1.858 + unsigned dst_scale; 1.859 + 1.860 + // Load 1.861 + vsrc = vreinterpret_u8_u32(vld1_lane_u32(src, vreinterpret_u32_u8(vsrc), 0)); 1.862 + vdst = 
vreinterpret_u8_u32(vld1_lane_u32(dst, vreinterpret_u32_u8(vdst), 0)); 1.863 + 1.864 + // Calc dst_scale 1.865 + dst_scale = vget_lane_u8(vsrc, 3); 1.866 + dst_scale *= alpha256; 1.867 + dst_scale >>= 8; 1.868 + dst_scale = 256 - dst_scale; 1.869 + 1.870 + // Process src 1.871 + vsrc_wide = vmovl_u8(vsrc); 1.872 + vsrc_wide = vmulq_n_u16(vsrc_wide, alpha256); 1.873 + 1.874 + // Process dst 1.875 + vdst_wide = vmovl_u8(vdst); 1.876 + vdst_wide = vmulq_n_u16(vdst_wide, dst_scale); 1.877 + 1.878 + // Combine 1.879 + vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 1.880 + 1.881 + vst1_lane_u32(dst, vreinterpret_u32_u8(vres), 0); 1.882 + dst++; 1.883 + src++; 1.884 + count--; 1.885 + } 1.886 + 1.887 + if (count) { 1.888 + uint8x8_t alpha_mask; 1.889 + static const uint8_t alpha_mask_setup[] = {3,3,3,3,7,7,7,7}; 1.890 + alpha_mask = vld1_u8(alpha_mask_setup); 1.891 + 1.892 + do { 1.893 + 1.894 + uint8x8_t vsrc, vdst, vres, vsrc_alphas; 1.895 + uint16x8_t vdst_wide, vsrc_wide, vsrc_scale, vdst_scale; 1.896 + 1.897 + __builtin_prefetch(src+32); 1.898 + __builtin_prefetch(dst+32); 1.899 + 1.900 + // Load 1.901 + vsrc = vreinterpret_u8_u32(vld1_u32(src)); 1.902 + vdst = vreinterpret_u8_u32(vld1_u32(dst)); 1.903 + 1.904 + // Prepare src_scale 1.905 + vsrc_scale = vdupq_n_u16(alpha256); 1.906 + 1.907 + // Calc dst_scale 1.908 + vsrc_alphas = vtbl1_u8(vsrc, alpha_mask); 1.909 + vdst_scale = vmovl_u8(vsrc_alphas); 1.910 + vdst_scale *= vsrc_scale; 1.911 + vdst_scale = vshrq_n_u16(vdst_scale, 8); 1.912 + vdst_scale = vsubq_u16(vdupq_n_u16(256), vdst_scale); 1.913 + 1.914 + // Process src 1.915 + vsrc_wide = vmovl_u8(vsrc); 1.916 + vsrc_wide *= vsrc_scale; 1.917 + 1.918 + // Process dst 1.919 + vdst_wide = vmovl_u8(vdst); 1.920 + vdst_wide *= vdst_scale; 1.921 + 1.922 + // Combine 1.923 + vres = vshrn_n_u16(vdst_wide, 8) + vshrn_n_u16(vsrc_wide, 8); 1.924 + 1.925 + vst1_u32(dst, vreinterpret_u32_u8(vres)); 1.926 + 1.927 + src += 2; 1.928 + dst += 2; 1.929 + count -= 2; 1.930 + } while(count); 1.931 + } 1.932 +} 1.933 + 1.934 +/////////////////////////////////////////////////////////////////////////////// 1.935 + 1.936 +#undef DEBUG_OPAQUE_DITHER 1.937 + 1.938 +#if defined(DEBUG_OPAQUE_DITHER) 1.939 +static void showme8(char *str, void *p, int len) 1.940 +{ 1.941 + static char buf[256]; 1.942 + char tbuf[32]; 1.943 + int i; 1.944 + char *pc = (char*) p; 1.945 + sprintf(buf,"%8s:", str); 1.946 + for(i=0;i<len;i++) { 1.947 + sprintf(tbuf, " %02x", pc[i]); 1.948 + strcat(buf, tbuf); 1.949 + } 1.950 + SkDebugf("%s\n", buf); 1.951 +} 1.952 +static void showme16(char *str, void *p, int len) 1.953 +{ 1.954 + static char buf[256]; 1.955 + char tbuf[32]; 1.956 + int i; 1.957 + uint16_t *pc = (uint16_t*) p; 1.958 + sprintf(buf,"%8s:", str); 1.959 + len = (len / sizeof(uint16_t)); /* passed as bytes */ 1.960 + for(i=0;i<len;i++) { 1.961 + sprintf(tbuf, " %04x", pc[i]); 1.962 + strcat(buf, tbuf); 1.963 + } 1.964 + SkDebugf("%s\n", buf); 1.965 +} 1.966 +#endif 1.967 + 1.968 +void S32A_D565_Opaque_Dither_neon (uint16_t * SK_RESTRICT dst, 1.969 + const SkPMColor* SK_RESTRICT src, 1.970 + int count, U8CPU alpha, int x, int y) { 1.971 + SkASSERT(255 == alpha); 1.972 + 1.973 +#define UNROLL 8 1.974 + 1.975 + if (count >= UNROLL) { 1.976 + 1.977 +#if defined(DEBUG_OPAQUE_DITHER) 1.978 + uint16_t tmpbuf[UNROLL]; 1.979 + int td[UNROLL]; 1.980 + int tdv[UNROLL]; 1.981 + int ta[UNROLL]; 1.982 + int tap[UNROLL]; 1.983 + uint16_t in_dst[UNROLL]; 1.984 + int offset = 0; 1.985 + int noisy = 0; 1.986 +#endif 1.987 + 
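    /* dbase below holds one row of the 4x4 dither matrix; gDitherMatrix_Neon
     * stores each row three times over, so a single 8-byte vld1_u8 starting at
     * offset (x & 3) yields the dither values for eight consecutive x positions.
     * The vmovl/vmlal/vshrn sequence in the loop then computes, per lane, the
     * same value as the scalar
     *     d = SkAlphaMul(DITHER_VALUE(x), SkAlpha255To256(a));
     * i.e. (dither * (a + 1)) >> 8, so the dither strength is scaled by the
     * source alpha.
     */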
1.988 + uint8x8_t dbase; 1.989 + const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 1.990 + dbase = vld1_u8(dstart); 1.991 + 1.992 + do { 1.993 + uint8x8_t sr, sg, sb, sa, d; 1.994 + uint16x8_t dst8, scale8, alpha8; 1.995 + uint16x8_t dst_r, dst_g, dst_b; 1.996 + 1.997 +#if defined(DEBUG_OPAQUE_DITHER) 1.998 + // calculate 8 elements worth into a temp buffer 1.999 + { 1.1000 + int my_y = y; 1.1001 + int my_x = x; 1.1002 + SkPMColor* my_src = (SkPMColor*)src; 1.1003 + uint16_t* my_dst = dst; 1.1004 + int i; 1.1005 + 1.1006 + DITHER_565_SCAN(my_y); 1.1007 + for(i = 0; i < UNROLL; i++) { 1.1008 + SkPMColor c = *my_src++; 1.1009 + SkPMColorAssert(c); 1.1010 + if (c) { 1.1011 + unsigned a = SkGetPackedA32(c); 1.1012 + 1.1013 + int d = SkAlphaMul(DITHER_VALUE(my_x), SkAlpha255To256(a)); 1.1014 + tdv[i] = DITHER_VALUE(my_x); 1.1015 + ta[i] = a; 1.1016 + tap[i] = SkAlpha255To256(a); 1.1017 + td[i] = d; 1.1018 + 1.1019 + unsigned sr = SkGetPackedR32(c); 1.1020 + unsigned sg = SkGetPackedG32(c); 1.1021 + unsigned sb = SkGetPackedB32(c); 1.1022 + sr = SkDITHER_R32_FOR_565(sr, d); 1.1023 + sg = SkDITHER_G32_FOR_565(sg, d); 1.1024 + sb = SkDITHER_B32_FOR_565(sb, d); 1.1025 + 1.1026 + uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 1.1027 + uint32_t dst_expanded = SkExpand_rgb_16(*my_dst); 1.1028 + dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 1.1029 + // now src and dst expanded are in g:11 r:10 x:1 b:10 1.1030 + tmpbuf[i] = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 1.1031 + td[i] = d; 1.1032 + } else { 1.1033 + tmpbuf[i] = *my_dst; 1.1034 + ta[i] = tdv[i] = td[i] = 0xbeef; 1.1035 + } 1.1036 + in_dst[i] = *my_dst; 1.1037 + my_dst += 1; 1.1038 + DITHER_INC_X(my_x); 1.1039 + } 1.1040 + } 1.1041 +#endif 1.1042 + 1.1043 + 1.1044 + { 1.1045 + register uint8x8_t d0 asm("d0"); 1.1046 + register uint8x8_t d1 asm("d1"); 1.1047 + register uint8x8_t d2 asm("d2"); 1.1048 + register uint8x8_t d3 asm("d3"); 1.1049 + 1.1050 + asm ("vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" 1.1051 + : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+r" (src) 1.1052 + : 1.1053 + ); 1.1054 +#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) 1.1055 + sr = d2; sg = d1; sb = d0; sa = d3; 1.1056 +#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) 1.1057 + sr = d0; sg = d1; sb = d2; sa = d3; 1.1058 +#endif 1.1059 + } 1.1060 + 1.1061 + /* calculate 'd', which will be 0..7 1.1062 + * dbase[] is 0..7; alpha is 0..256; 16 bits suffice 1.1063 + */ 1.1064 + alpha8 = vmovl_u8(dbase); 1.1065 + alpha8 = vmlal_u8(alpha8, sa, dbase); 1.1066 + d = vshrn_n_u16(alpha8, 8); // narrowing too 1.1067 + 1.1068 + // sr = sr - (sr>>5) + d 1.1069 + /* watching for 8-bit overflow. 
d is 0..7; risky range of 1.1070 + * sr is >248; and then (sr>>5) is 7 so it offsets 'd'; 1.1071 + * safe as long as we do ((sr-sr>>5) + d) 1.1072 + */ 1.1073 + sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 1.1074 + sr = vadd_u8(sr, d); 1.1075 + 1.1076 + // sb = sb - (sb>>5) + d 1.1077 + sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 1.1078 + sb = vadd_u8(sb, d); 1.1079 + 1.1080 + // sg = sg - (sg>>6) + d>>1; similar logic for overflows 1.1081 + sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 1.1082 + sg = vadd_u8(sg, vshr_n_u8(d,1)); 1.1083 + 1.1084 + // need to pick up 8 dst's -- at 16 bits each, 128 bits 1.1085 + dst8 = vld1q_u16(dst); 1.1086 + dst_b = vandq_u16(dst8, vdupq_n_u16(SK_B16_MASK)); 1.1087 + dst_g = vshrq_n_u16(vshlq_n_u16(dst8, SK_R16_BITS), SK_R16_BITS + SK_B16_BITS); 1.1088 + dst_r = vshrq_n_u16(dst8, SK_R16_SHIFT); // clearing hi bits 1.1089 + 1.1090 + // blend 1.1091 + scale8 = vsubw_u8(vdupq_n_u16(256), sa); 1.1092 + 1.1093 + // combine the addq and mul, save 3 insns 1.1094 + scale8 = vshrq_n_u16(scale8, 3); 1.1095 + dst_b = vmlaq_u16(vshll_n_u8(sb,2), dst_b, scale8); 1.1096 + dst_g = vmlaq_u16(vshll_n_u8(sg,3), dst_g, scale8); 1.1097 + dst_r = vmlaq_u16(vshll_n_u8(sr,2), dst_r, scale8); 1.1098 + 1.1099 + // repack to store 1.1100 + dst8 = vshrq_n_u16(dst_b, 5); 1.1101 + dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_g, 5), 5); 1.1102 + dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dst_r,5), 11); 1.1103 + 1.1104 + vst1q_u16(dst, dst8); 1.1105 + 1.1106 +#if defined(DEBUG_OPAQUE_DITHER) 1.1107 + // verify my 8 elements match the temp buffer 1.1108 + { 1.1109 + int i, bad=0; 1.1110 + static int invocation; 1.1111 + 1.1112 + for (i = 0; i < UNROLL; i++) { 1.1113 + if (tmpbuf[i] != dst[i]) { 1.1114 + bad=1; 1.1115 + } 1.1116 + } 1.1117 + if (bad) { 1.1118 + SkDebugf("BAD S32A_D565_Opaque_Dither_neon(); invocation %d offset %d\n", 1.1119 + invocation, offset); 1.1120 + SkDebugf(" alpha 0x%x\n", alpha); 1.1121 + for (i = 0; i < UNROLL; i++) 1.1122 + SkDebugf("%2d: %s %04x w %04x id %04x s %08x d %04x %04x %04x %04x\n", 1.1123 + i, ((tmpbuf[i] != dst[i])?"BAD":"got"), dst[i], tmpbuf[i], 1.1124 + in_dst[i], src[i-8], td[i], tdv[i], tap[i], ta[i]); 1.1125 + 1.1126 + showme16("alpha8", &alpha8, sizeof(alpha8)); 1.1127 + showme16("scale8", &scale8, sizeof(scale8)); 1.1128 + showme8("d", &d, sizeof(d)); 1.1129 + showme16("dst8", &dst8, sizeof(dst8)); 1.1130 + showme16("dst_b", &dst_b, sizeof(dst_b)); 1.1131 + showme16("dst_g", &dst_g, sizeof(dst_g)); 1.1132 + showme16("dst_r", &dst_r, sizeof(dst_r)); 1.1133 + showme8("sb", &sb, sizeof(sb)); 1.1134 + showme8("sg", &sg, sizeof(sg)); 1.1135 + showme8("sr", &sr, sizeof(sr)); 1.1136 + 1.1137 + return; 1.1138 + } 1.1139 + offset += UNROLL; 1.1140 + invocation++; 1.1141 + } 1.1142 +#endif 1.1143 + dst += UNROLL; 1.1144 + count -= UNROLL; 1.1145 + // skip x += UNROLL, since it's unchanged mod-4 1.1146 + } while (count >= UNROLL); 1.1147 + } 1.1148 +#undef UNROLL 1.1149 + 1.1150 + // residuals 1.1151 + if (count > 0) { 1.1152 + DITHER_565_SCAN(y); 1.1153 + do { 1.1154 + SkPMColor c = *src++; 1.1155 + SkPMColorAssert(c); 1.1156 + if (c) { 1.1157 + unsigned a = SkGetPackedA32(c); 1.1158 + 1.1159 + // dither and alpha are just temporary variables to work-around 1.1160 + // an ICE in debug. 
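                // ("ICE" here means an internal compiler error; keeping
                //  DITHER_VALUE(x) and SkAlpha255To256(a) in named locals is
                //  presumably what avoids it.  The math is unchanged:
                //  d = (dither * (a + 1)) >> 8, the same per-pixel dither
                //  scaling that the vmlal/vshrn pair performs in the vector
                //  loop above.)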
1.1161 + unsigned dither = DITHER_VALUE(x); 1.1162 + unsigned alpha = SkAlpha255To256(a); 1.1163 + int d = SkAlphaMul(dither, alpha); 1.1164 + 1.1165 + unsigned sr = SkGetPackedR32(c); 1.1166 + unsigned sg = SkGetPackedG32(c); 1.1167 + unsigned sb = SkGetPackedB32(c); 1.1168 + sr = SkDITHER_R32_FOR_565(sr, d); 1.1169 + sg = SkDITHER_G32_FOR_565(sg, d); 1.1170 + sb = SkDITHER_B32_FOR_565(sb, d); 1.1171 + 1.1172 + uint32_t src_expanded = (sg << 24) | (sr << 13) | (sb << 2); 1.1173 + uint32_t dst_expanded = SkExpand_rgb_16(*dst); 1.1174 + dst_expanded = dst_expanded * (SkAlpha255To256(255 - a) >> 3); 1.1175 + // now src and dst expanded are in g:11 r:10 x:1 b:10 1.1176 + *dst = SkCompact_rgb_16((src_expanded + dst_expanded) >> 5); 1.1177 + } 1.1178 + dst += 1; 1.1179 + DITHER_INC_X(x); 1.1180 + } while (--count != 0); 1.1181 + } 1.1182 +} 1.1183 + 1.1184 +/////////////////////////////////////////////////////////////////////////////// 1.1185 + 1.1186 +#undef DEBUG_S32_OPAQUE_DITHER 1.1187 + 1.1188 +void S32_D565_Opaque_Dither_neon(uint16_t* SK_RESTRICT dst, 1.1189 + const SkPMColor* SK_RESTRICT src, 1.1190 + int count, U8CPU alpha, int x, int y) { 1.1191 + SkASSERT(255 == alpha); 1.1192 + 1.1193 +#define UNROLL 8 1.1194 + if (count >= UNROLL) { 1.1195 + uint8x8_t d; 1.1196 + const uint8_t *dstart = &gDitherMatrix_Neon[(y&3)*12 + (x&3)]; 1.1197 + d = vld1_u8(dstart); 1.1198 + 1.1199 + while (count >= UNROLL) { 1.1200 + uint8x8_t sr, sg, sb; 1.1201 + uint16x8_t dr, dg, db; 1.1202 + uint16x8_t dst8; 1.1203 + 1.1204 + { 1.1205 + register uint8x8_t d0 asm("d0"); 1.1206 + register uint8x8_t d1 asm("d1"); 1.1207 + register uint8x8_t d2 asm("d2"); 1.1208 + register uint8x8_t d3 asm("d3"); 1.1209 + 1.1210 + asm ( 1.1211 + "vld4.8 {d0-d3},[%[src]]! /* r=%P0 g=%P1 b=%P2 a=%P3 */" 1.1212 + : "=w" (d0), "=w" (d1), "=w" (d2), "=w" (d3), [src] "+&r" (src) 1.1213 + : 1.1214 + ); 1.1215 + sg = d1; 1.1216 +#if SK_PMCOLOR_BYTE_ORDER(B,G,R,A) 1.1217 + sr = d2; sb = d0; 1.1218 +#elif SK_PMCOLOR_BYTE_ORDER(R,G,B,A) 1.1219 + sr = d0; sb = d2; 1.1220 +#endif 1.1221 + } 1.1222 + /* XXX: if we want to prefetch, hide it in the above asm() 1.1223 + * using the gcc __builtin_prefetch(), the prefetch will 1.1224 + * fall to the bottom of the loop -- it won't stick up 1.1225 + * at the top of the loop, just after the vld4. 1.1226 + */ 1.1227 + 1.1228 + // sr = sr - (sr>>5) + d 1.1229 + sr = vsub_u8(sr, vshr_n_u8(sr, 5)); 1.1230 + dr = vaddl_u8(sr, d); 1.1231 + 1.1232 + // sb = sb - (sb>>5) + d 1.1233 + sb = vsub_u8(sb, vshr_n_u8(sb, 5)); 1.1234 + db = vaddl_u8(sb, d); 1.1235 + 1.1236 + // sg = sg - (sg>>6) + d>>1; similar logic for overflows 1.1237 + sg = vsub_u8(sg, vshr_n_u8(sg, 6)); 1.1238 + dg = vaddl_u8(sg, vshr_n_u8(d, 1)); 1.1239 + 1.1240 + // pack high bits of each into 565 format (rgb, b is lsb) 1.1241 + dst8 = vshrq_n_u16(db, 3); 1.1242 + dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dg, 2), 5); 1.1243 + dst8 = vsliq_n_u16(dst8, vshrq_n_u16(dr, 3), 11); 1.1244 + 1.1245 + // store it 1.1246 + vst1q_u16(dst, dst8); 1.1247 + 1.1248 +#if defined(DEBUG_S32_OPAQUE_DITHER) 1.1249 + // always good to know if we generated good results 1.1250 + { 1.1251 + int i, myx = x, myy = y; 1.1252 + DITHER_565_SCAN(myy); 1.1253 + for (i=0;i<UNROLL;i++) { 1.1254 + // the '!' in the asm block above post-incremented src by the 8 pixels it reads. 
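                // (So src[i - 8] re-reads the i-th pixel of the group that was
                //  just consumed; each one is re-converted with the scalar
                //  SkDitherRGB32To565() and compared against what the NEON path
                //  stored in dst[i], logging any mismatch.)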
1.1255 + SkPMColor c = src[i-8]; 1.1256 + unsigned dither = DITHER_VALUE(myx); 1.1257 + uint16_t val = SkDitherRGB32To565(c, dither); 1.1258 + if (val != dst[i]) { 1.1259 + SkDebugf("RBE: src %08x dither %02x, want %04x got %04x dbas[i] %02x\n", 1.1260 + c, dither, val, dst[i], dstart[i]); 1.1261 + } 1.1262 + DITHER_INC_X(myx); 1.1263 + } 1.1264 + } 1.1265 +#endif 1.1266 + 1.1267 + dst += UNROLL; 1.1268 + // we don't need to increment src as the asm above has already done it 1.1269 + count -= UNROLL; 1.1270 + x += UNROLL; // probably superfluous 1.1271 + } 1.1272 + } 1.1273 +#undef UNROLL 1.1274 + 1.1275 + // residuals 1.1276 + if (count > 0) { 1.1277 + DITHER_565_SCAN(y); 1.1278 + do { 1.1279 + SkPMColor c = *src++; 1.1280 + SkPMColorAssert(c); 1.1281 + SkASSERT(SkGetPackedA32(c) == 255); 1.1282 + 1.1283 + unsigned dither = DITHER_VALUE(x); 1.1284 + *dst++ = SkDitherRGB32To565(c, dither); 1.1285 + DITHER_INC_X(x); 1.1286 + } while (--count != 0); 1.1287 + } 1.1288 +} 1.1289 + 1.1290 +void Color32_arm_neon(SkPMColor* dst, const SkPMColor* src, int count, 1.1291 + SkPMColor color) { 1.1292 + if (count <= 0) { 1.1293 + return; 1.1294 + } 1.1295 + 1.1296 + if (0 == color) { 1.1297 + if (src != dst) { 1.1298 + memcpy(dst, src, count * sizeof(SkPMColor)); 1.1299 + } 1.1300 + return; 1.1301 + } 1.1302 + 1.1303 + unsigned colorA = SkGetPackedA32(color); 1.1304 + if (255 == colorA) { 1.1305 + sk_memset32(dst, color, count); 1.1306 + } else { 1.1307 + unsigned scale = 256 - SkAlpha255To256(colorA); 1.1308 + 1.1309 + if (count >= 8) { 1.1310 + // at the end of this assembly, count will have been decremented 1.1311 + // to a negative value. That is, if count mod 8 = x, it will be 1.1312 + // -8 +x coming out. 1.1313 + asm volatile ( 1.1314 + PLD128(src, 0) 1.1315 + 1.1316 + "vdup.32 q0, %[color] \n\t" 1.1317 + 1.1318 + PLD128(src, 128) 1.1319 + 1.1320 + // scale numerical interval [0-255], so load as 8 bits 1.1321 + "vdup.8 d2, %[scale] \n\t" 1.1322 + 1.1323 + PLD128(src, 256) 1.1324 + 1.1325 + "subs %[count], %[count], #8 \n\t" 1.1326 + 1.1327 + PLD128(src, 384) 1.1328 + 1.1329 + "Loop_Color32: \n\t" 1.1330 + 1.1331 + // load src color, 8 pixels, 4 64 bit registers 1.1332 + // (and increment src). 1.1333 + "vld1.32 {d4-d7}, [%[src]]! \n\t" 1.1334 + 1.1335 + PLD128(src, 384) 1.1336 + 1.1337 + // multiply long by scale, 64 bits at a time, 1.1338 + // destination into a 128 bit register. 1.1339 + "vmull.u8 q4, d4, d2 \n\t" 1.1340 + "vmull.u8 q5, d5, d2 \n\t" 1.1341 + "vmull.u8 q6, d6, d2 \n\t" 1.1342 + "vmull.u8 q7, d7, d2 \n\t" 1.1343 + 1.1344 + // shift the 128 bit registers, containing the 16 1.1345 + // bit scaled values back to 8 bits, narrowing the 1.1346 + // results to 64 bit registers. 1.1347 + "vshrn.i16 d8, q4, #8 \n\t" 1.1348 + "vshrn.i16 d9, q5, #8 \n\t" 1.1349 + "vshrn.i16 d10, q6, #8 \n\t" 1.1350 + "vshrn.i16 d11, q7, #8 \n\t" 1.1351 + 1.1352 + // adding back the color, using 128 bit registers. 1.1353 + "vadd.i8 q6, q4, q0 \n\t" 1.1354 + "vadd.i8 q7, q5, q0 \n\t" 1.1355 + 1.1356 + // store back the 8 calculated pixels (2 128 bit 1.1357 + // registers), and increment dst. 1.1358 + "vst1.32 {d12-d15}, [%[dst]]! 
\n\t" 1.1359 + 1.1360 + "subs %[count], %[count], #8 \n\t" 1.1361 + "bge Loop_Color32 \n\t" 1.1362 + : [src] "+r" (src), [dst] "+r" (dst), [count] "+r" (count) 1.1363 + : [color] "r" (color), [scale] "r" (scale) 1.1364 + : "cc", "memory", 1.1365 + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", 1.1366 + "d8", "d9", "d10", "d11", "d12", "d13", "d14", "d15" 1.1367 + ); 1.1368 + // At this point, if we went through the inline assembly, count is 1.1369 + // a negative value: 1.1370 + // if the value is -8, there is no pixel left to process. 1.1371 + // if the value is -7, there is one pixel left to process 1.1372 + // ... 1.1373 + // And'ing it with 7 will give us the number of pixels 1.1374 + // left to process. 1.1375 + count = count & 0x7; 1.1376 + } 1.1377 + 1.1378 + while (count > 0) { 1.1379 + *dst = color + SkAlphaMulQ(*src, scale); 1.1380 + src += 1; 1.1381 + dst += 1; 1.1382 + count--; 1.1383 + } 1.1384 + } 1.1385 +} 1.1386 + 1.1387 +/////////////////////////////////////////////////////////////////////////////// 1.1388 + 1.1389 +const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm_neon[] = { 1.1390 + // no dither 1.1391 + // NOTE: For the S32_D565_Blend function below, we don't have a special 1.1392 + // version that assumes that each source pixel is opaque. But our 1.1393 + // S32A is still faster than the default, so use it. 1.1394 + S32_D565_Opaque_neon, 1.1395 + S32A_D565_Blend_neon, // really S32_D565_Blend 1.1396 + S32A_D565_Opaque_neon, 1.1397 + S32A_D565_Blend_neon, 1.1398 + 1.1399 + // dither 1.1400 + S32_D565_Opaque_Dither_neon, 1.1401 + S32_D565_Blend_Dither_neon, 1.1402 + S32A_D565_Opaque_Dither_neon, 1.1403 + NULL, // S32A_D565_Blend_Dither 1.1404 +}; 1.1405 + 1.1406 +const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm_neon[] = { 1.1407 + NULL, // S32_Opaque, 1.1408 + S32_Blend_BlitRow32_neon, // S32_Blend, 1.1409 + /* 1.1410 + * We have two choices for S32A_Opaque procs. The one reads the src alpha 1.1411 + * value and attempts to optimize accordingly. The optimization is 1.1412 + * sensitive to the source content and is not a win in all cases. For 1.1413 + * example, if there are a lot of transitions between the alpha states, 1.1414 + * the performance will almost certainly be worse. However, for many 1.1415 + * common cases the performance is equivalent or better than the standard 1.1416 + * case where we do not inspect the src alpha. 1.1417 + */ 1.1418 +#if SK_A32_SHIFT == 24 1.1419 + // This proc assumes the alpha value occupies bits 24-32 of each SkPMColor 1.1420 + S32A_Opaque_BlitRow32_neon_src_alpha, // S32A_Opaque, 1.1421 +#else 1.1422 + S32A_Opaque_BlitRow32_neon, // S32A_Opaque, 1.1423 +#endif 1.1424 + S32A_Blend_BlitRow32_neon // S32A_Blend 1.1425 +};