--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gfx/skia/trunk/src/opts/SkBlitRow_opts_arm.cpp	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,394 @@
+/*
+ * Copyright 2012 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "SkBlitRow.h"
+#include "SkColorPriv.h"
+#include "SkDither.h"
+#include "SkMathPriv.h"
+#include "SkUtils.h"
+#include "SkUtilsArm.h"
+
+#include "SkCachePreload_arm.h"
+
+// Define USE_NEON_CODE to indicate that we need to build NEON routines
+#define USE_NEON_CODE (!SK_ARM_NEON_IS_NONE)
+
+// Define USE_ARM_CODE to indicate that we need to build ARM routines
+#define USE_ARM_CODE (!SK_ARM_NEON_IS_ALWAYS)
+
+#if USE_NEON_CODE
+    #include "SkBlitRow_opts_arm_neon.h"
+#endif
+
+#if USE_ARM_CODE
+
+static void S32A_D565_Opaque(uint16_t* SK_RESTRICT dst,
+                             const SkPMColor* SK_RESTRICT src, int count,
+                             U8CPU alpha, int /*x*/, int /*y*/) {
+    SkASSERT(255 == alpha);
+
+    asm volatile (
+        "1:                                   \n\t"
+        "ldr     r3, [%[src]], #4             \n\t"
+        "cmp     r3, #0xff000000              \n\t"
+        "blo     2f                           \n\t"
+        "and     r4, r3, #0x0000f8            \n\t"
+        "and     r5, r3, #0x00fc00            \n\t"
+        "and     r6, r3, #0xf80000            \n\t"
+#ifdef SK_ARM_HAS_EDSP
+        "pld     [r1, #32]                    \n\t"
+#endif
+        "lsl     r3, r4, #8                   \n\t"
+        "orr     r3, r3, r5, lsr #5           \n\t"
+        "orr     r3, r3, r6, lsr #19          \n\t"
+        "subs    %[count], %[count], #1       \n\t"
+        "strh    r3, [%[dst]], #2             \n\t"
+        "bne     1b                           \n\t"
+        "b       4f                           \n\t"
+        "2:                                   \n\t"
+        "lsrs    r7, r3, #24                  \n\t"
+        "beq     3f                           \n\t"
+        "ldrh    r4, [%[dst]]                 \n\t"
+        "rsb     r7, r7, #255                 \n\t"
+        "and     r6, r4, #0x001f              \n\t"
+#if SK_ARM_ARCH <= 6
+        "lsl     r5, r4, #21                  \n\t"
+        "lsr     r5, r5, #26                  \n\t"
+#else
+        "ubfx    r5, r4, #5, #6               \n\t"
+#endif
+#ifdef SK_ARM_HAS_EDSP
+        "pld     [r0, #16]                    \n\t"
+#endif
+        "lsr     r4, r4, #11                  \n\t"
+#ifdef SK_ARM_HAS_EDSP
+        "smulbb  r6, r6, r7                   \n\t"
+        "smulbb  r5, r5, r7                   \n\t"
+        "smulbb  r4, r4, r7                   \n\t"
+#else
+        "mul     r6, r6, r7                   \n\t"
+        "mul     r5, r5, r7                   \n\t"
+        "mul     r4, r4, r7                   \n\t"
+#endif
+#if SK_ARM_ARCH >= 6
+        "uxtb    r7, r3, ROR #16              \n\t"
+        "uxtb    ip, r3, ROR #8               \n\t"
+#else
+        "mov     ip, #0xff                    \n\t"
+        "and     r7, ip, r3, ROR #16          \n\t"
+        "and     ip, ip, r3, ROR #8           \n\t"
+#endif
+        "and     r3, r3, #0xff                \n\t"
+        "add     r6, r6, #16                  \n\t"
+        "add     r5, r5, #32                  \n\t"
+        "add     r4, r4, #16                  \n\t"
+        "add     r6, r6, r6, lsr #5           \n\t"
+        "add     r5, r5, r5, lsr #6           \n\t"
+        "add     r4, r4, r4, lsr #5           \n\t"
+        "add     r6, r7, r6, lsr #5           \n\t"
+        "add     r5, ip, r5, lsr #6           \n\t"
+        "add     r4, r3, r4, lsr #5           \n\t"
+        "lsr     r6, r6, #3                   \n\t"
+        "and     r5, r5, #0xfc                \n\t"
+        "and     r4, r4, #0xf8                \n\t"
+        "orr     r6, r6, r5, lsl #3           \n\t"
+        "orr     r4, r6, r4, lsl #8           \n\t"
+        "strh    r4, [%[dst]], #2             \n\t"
+#ifdef SK_ARM_HAS_EDSP
+        "pld     [r1, #32]                    \n\t"
+#endif
+        "subs    %[count], %[count], #1       \n\t"
+        "bne     1b                           \n\t"
+        "b       4f                           \n\t"
+        "3:                                   \n\t"
+        "subs    %[count], %[count], #1       \n\t"
+        "add     %[dst], %[dst], #2           \n\t"
+        "bne     1b                           \n\t"
+        "4:                                   \n\t"
+        : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count)
+        :
+        : "memory", "cc", "r3", "r4", "r5", "r6", "r7", "ip"
+        );
+}
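
The loop above special-cases fully opaque and fully transparent source pixels and otherwise blends in 565 space. For readers not fluent in ARM assembly, here is a minimal scalar sketch of the per-pixel math, assuming the RGBA byte order the asm relies on (R in the low byte of SkPMColor); the helper name is illustrative, not part of the patch:

    // Hypothetical reference helper -- a sketch of what the asm computes.
    static inline uint16_t SrcOver8888To565(uint32_t src, uint16_t dst) {
        unsigned a = src >> 24;
        if (a == 255) {                       // opaque: pure format conversion
            return (uint16_t)(((src & 0x0000f8) << 8) |    // R -> bits 15..11
                              ((src & 0x00fc00) >> 5) |    // G -> bits 10..5
                              ((src & 0xf80000) >> 19));   // B -> bits 4..0
        }
        if (a == 0) {
            return dst;                       // transparent: leave dst alone
        }
        unsigned scale = 255 - a;
        unsigned r = (dst >> 11)         * scale + 16;
        unsigned g = ((dst >> 5) & 0x3f) * scale + 32;
        unsigned b = (dst & 0x1f)        * scale + 16;
        r += r >> 5;  g += g >> 6;  b += b >> 5;    // rounding, as in the asm
        r = (src & 0xff)         + (r >> 5);        // add premultiplied src
        g = ((src >> 8) & 0xff)  + (g >> 6);
        b = ((src >> 16) & 0xff) + (b >> 5);
        return (uint16_t)(((r & 0xf8) << 8) | ((g & 0xfc) << 3) | (b >> 3));
    }
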
"ip" 1.118 + ); 1.119 +} 1.120 + 1.121 +static void S32A_Opaque_BlitRow32_arm(SkPMColor* SK_RESTRICT dst, 1.122 + const SkPMColor* SK_RESTRICT src, 1.123 + int count, U8CPU alpha) { 1.124 + 1.125 + SkASSERT(255 == alpha); 1.126 + 1.127 + asm volatile ( 1.128 + "cmp %[count], #0 \n\t" /* comparing count with 0 */ 1.129 + "beq 3f \n\t" /* if zero exit */ 1.130 + 1.131 + "mov ip, #0xff \n\t" /* load the 0xff mask in ip */ 1.132 + "orr ip, ip, ip, lsl #16 \n\t" /* convert it to 0xff00ff in ip */ 1.133 + 1.134 + "cmp %[count], #2 \n\t" /* compare count with 2 */ 1.135 + "blt 2f \n\t" /* if less than 2 -> single loop */ 1.136 + 1.137 + /* Double Loop */ 1.138 + "1: \n\t" /* <double loop> */ 1.139 + "ldm %[src]!, {r5,r6} \n\t" /* load the src(s) at r5-r6 */ 1.140 + "ldm %[dst], {r7,r8} \n\t" /* loading dst(s) into r7-r8 */ 1.141 + "lsr r4, r5, #24 \n\t" /* extracting the alpha from source and storing it to r4 */ 1.142 + 1.143 + /* ----------- */ 1.144 + "and r9, ip, r7 \n\t" /* r9 = br masked by ip */ 1.145 + "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 256 -> r4=scale */ 1.146 + "and r10, ip, r7, lsr #8 \n\t" /* r10 = ag masked by ip */ 1.147 + 1.148 + "mul r9, r9, r4 \n\t" /* br = br * scale */ 1.149 + "mul r10, r10, r4 \n\t" /* ag = ag * scale */ 1.150 + "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */ 1.151 + 1.152 + "and r10, r10, ip, lsl #8 \n\t" /* mask ag with reverse mask */ 1.153 + "lsr r4, r6, #24 \n\t" /* extracting the alpha from source and storing it to r4 */ 1.154 + "orr r7, r9, r10 \n\t" /* br | ag*/ 1.155 + 1.156 + "add r7, r5, r7 \n\t" /* dst = src + calc dest(r7) */ 1.157 + "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 255 -> r4=scale */ 1.158 + 1.159 + /* ----------- */ 1.160 + "and r9, ip, r8 \n\t" /* r9 = br masked by ip */ 1.161 + 1.162 + "and r10, ip, r8, lsr #8 \n\t" /* r10 = ag masked by ip */ 1.163 + "mul r9, r9, r4 \n\t" /* br = br * scale */ 1.164 + "sub %[count], %[count], #2 \n\t" 1.165 + "mul r10, r10, r4 \n\t" /* ag = ag * scale */ 1.166 + 1.167 + "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */ 1.168 + "and r10, r10, ip, lsl #8 \n\t" /* mask ag with reverse mask */ 1.169 + "cmp %[count], #1 \n\t" /* comparing count with 1 */ 1.170 + "orr r8, r9, r10 \n\t" /* br | ag */ 1.171 + 1.172 + "add r8, r6, r8 \n\t" /* dst = src + calc dest(r8) */ 1.173 + 1.174 + /* ----------------- */ 1.175 + "stm %[dst]!, {r7,r8} \n\t" /* *dst = r7, increment dst by two (each times 4) */ 1.176 + /* ----------------- */ 1.177 + 1.178 + "bgt 1b \n\t" /* if greater than 1 -> reloop */ 1.179 + "blt 3f \n\t" /* if less than 1 -> exit */ 1.180 + 1.181 + /* Single Loop */ 1.182 + "2: \n\t" /* <single loop> */ 1.183 + "ldr r5, [%[src]], #4 \n\t" /* load the src pointer into r5 r5=src */ 1.184 + "ldr r7, [%[dst]] \n\t" /* loading dst into r7 */ 1.185 + "lsr r4, r5, #24 \n\t" /* extracting the alpha from source and storing it to r4 */ 1.186 + 1.187 + /* ----------- */ 1.188 + "and r9, ip, r7 \n\t" /* r9 = br masked by ip */ 1.189 + "rsb r4, r4, #256 \n\t" /* subtracting the alpha from 256 -> r4=scale */ 1.190 + 1.191 + "and r10, ip, r7, lsr #8 \n\t" /* r10 = ag masked by ip */ 1.192 + "mul r9, r9, r4 \n\t" /* br = br * scale */ 1.193 + "mul r10, r10, r4 \n\t" /* ag = ag * scale */ 1.194 + "and r9, ip, r9, lsr #8 \n\t" /* lsr br by 8 and mask it */ 1.195 + 1.196 + "and r10, r10, ip, lsl #8 \n\t" /* mask ag */ 1.197 + "orr r7, r9, r10 \n\t" /* br | ag */ 1.198 + 1.199 + "add r7, r5, r7 \n\t" /* *dst = src + calc dest(r7) */ 1.200 + 1.201 + /* ----------------- 
+
+/*
+ * ARM asm version of S32A_Blend_BlitRow32
+ */
+void S32A_Blend_BlitRow32_arm(SkPMColor* SK_RESTRICT dst,
+                              const SkPMColor* SK_RESTRICT src,
+                              int count, U8CPU alpha) {
+    asm volatile (
+        "cmp    %[count], #0               \n\t" /* comparing count with 0 */
+        "beq    3f                         \n\t" /* if zero exit */
+
+        "mov    r12, #0xff                 \n\t" /* load the 0xff mask in r12 */
+        "orr    r12, r12, r12, lsl #16     \n\t" /* convert it to 0xff00ff in r12 */
+
+        /* src1,2_scale */
+        "add    %[alpha], %[alpha], #1     \n\t" /* loading %[alpha]=src_scale=alpha+1 */
+
+        "cmp    %[count], #2               \n\t" /* comparing count with 2 */
+        "blt    2f                         \n\t" /* if less than 2 -> single loop */
+
+        /* Double Loop */
+        "1:                                \n\t" /* <double loop> */
+        "ldm    %[src]!, {r5, r6}          \n\t" /* load two src pixels into r5 and r6 */
+        "ldm    %[dst], {r7, r8}           \n\t" /* load two dst pixels into r7 and r8 */
+
+        /* dst1_scale and dst2_scale */
+        "lsr    r9, r5, #24                \n\t" /* src >> 24 */
+        "lsr    r10, r6, #24               \n\t" /* src >> 24 */
+#ifdef SK_ARM_HAS_EDSP
+        "smulbb r9, r9, %[alpha]           \n\t" /* r9 = SkMulS16 r9 with src_scale */
+        "smulbb r10, r10, %[alpha]         \n\t" /* r10 = SkMulS16 r10 with src_scale */
+#else
+        "mul    r9, r9, %[alpha]           \n\t" /* r9 = SkMulS16 r9 with src_scale */
+        "mul    r10, r10, %[alpha]         \n\t" /* r10 = SkMulS16 r10 with src_scale */
+#endif
+        "lsr    r9, r9, #8                 \n\t" /* r9 >> 8 */
+        "lsr    r10, r10, #8               \n\t" /* r10 >> 8 */
+        "rsb    r9, r9, #256               \n\t" /* dst1_scale = r9 = 255 - r9 + 1 */
+        "rsb    r10, r10, #256             \n\t" /* dst2_scale = r10 = 255 - r10 + 1 */
+
+        /* ---------------------- */
+
+        /* src1, src1_scale */
+        "and    r11, r12, r5, lsr #8       \n\t" /* ag = r11 = r5 masked by r12 lsr by #8 */
+        "and    r4, r12, r5                \n\t" /* rb = r4 = r5 masked by r12 */
+        "mul    r11, r11, %[alpha]         \n\t" /* ag = r11 times src_scale */
+        "mul    r4, r4, %[alpha]           \n\t" /* rb = r4 times src_scale */
+        "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
+        "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
+        "orr    r5, r11, r4                \n\t" /* r5 = (src1, src_scale) */
+
+        /* dst1, dst1_scale */
+        "and    r11, r12, r7, lsr #8       \n\t" /* ag = r11 = r7 masked by r12 lsr by #8 */
+        "and    r4, r12, r7                \n\t" /* rb = r4 = r7 masked by r12 */
+        "mul    r11, r11, r9               \n\t" /* ag = r11 times dst_scale (r9) */
+        "mul    r4, r4, r9                 \n\t" /* rb = r4 times dst_scale (r9) */
+        "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
+        "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
+        "orr    r9, r11, r4                \n\t" /* r9 = (dst1, dst_scale) */
+
+        /* ---------------------- */
+        "add    r9, r5, r9                 \n\t" /* *dst = src plus dst both scaled */
+        /* ---------------------- */
+
+        /* ====================== */
+
+        /* src2, src2_scale */
+        "and    r11, r12, r6, lsr #8       \n\t" /* ag = r11 = r6 masked by r12 lsr by #8 */
+        "and    r4, r12, r6                \n\t" /* rb = r4 = r6 masked by r12 */
+        "mul    r11, r11, %[alpha]         \n\t" /* ag = r11 times src_scale */
+        "mul    r4, r4, %[alpha]           \n\t" /* rb = r4 times src_scale */
+        "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
+        "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
+        "orr    r6, r11, r4                \n\t" /* r6 = (src2, src_scale) */
+
+        /* dst2, dst2_scale */
+        "and    r11, r12, r8, lsr #8       \n\t" /* ag = r11 = r8 masked by r12 lsr by #8 */
+        "and    r4, r12, r8                \n\t" /* rb = r4 = r8 masked by r12 */
+        "mul    r11, r11, r10              \n\t" /* ag = r11 times dst_scale (r10) */
+        "mul    r4, r4, r10                \n\t" /* rb = r4 times dst_scale (r10) */
+        "and    r11, r11, r12, lsl #8      \n\t" /* ag masked by reverse mask (r12) */
+        "and    r4, r12, r4, lsr #8        \n\t" /* rb masked by mask (r12) */
+        "orr    r10, r11, r4               \n\t" /* r10 = (dst2, dst_scale) */
+
+        "sub    %[count], %[count], #2     \n\t" /* decrease count by 2 */
+        /* ---------------------- */
+        "add    r10, r6, r10               \n\t" /* *dst = src plus dst both scaled */
+        /* ---------------------- */
+        "cmp    %[count], #1               \n\t" /* compare count with 1 */
+        /* ----------------- */
+        "stm    %[dst]!, {r9, r10}         \n\t" /* store r9-r10, post-increment dst by two pixels (8 bytes) */
+        /* ----------------- */
+
+        "bgt    1b                         \n\t" /* if %[count] greater than 1 reloop */
+        "blt    3f                         \n\t" /* if %[count] less than 1 exit */
+                                                 /* else fall into the single loop */
+        /* Single Loop */
+        "2:                                \n\t" /* <single loop> */
+        "ldr    r5, [%[src]], #4           \n\t" /* load the src pixel into r5, post-increment src */
+        "ldr    r7, [%[dst]]               \n\t" /* load the dst pixel into r7 */
+
+        "lsr    r6, r5, #24                \n\t" /* src >> 24 */
+        "and    r8, r12, r5, lsr #8        \n\t" /* ag = r8 = r5 masked by r12 lsr by #8 */
+#ifdef SK_ARM_HAS_EDSP
+        "smulbb r6, r6, %[alpha]           \n\t" /* r6 = SkMulS16 with src_scale */
+#else
+        "mul    r6, r6, %[alpha]           \n\t" /* r6 = SkMulS16 with src_scale */
+#endif
+        "and    r9, r12, r5                \n\t" /* rb = r9 = r5 masked by r12 */
+        "lsr    r6, r6, #8                 \n\t" /* r6 >> 8 */
+        "mul    r8, r8, %[alpha]           \n\t" /* ag = r8 times scale */
+        "rsb    r6, r6, #256               \n\t" /* r6 = 255 - r6 + 1 */
+
+        /* src, src_scale */
+        "mul    r9, r9, %[alpha]           \n\t" /* rb = r9 times scale */
+        "and    r8, r8, r12, lsl #8        \n\t" /* ag masked by reverse mask (r12) */
+        "and    r9, r12, r9, lsr #8        \n\t" /* rb masked by mask (r12) */
+        "orr    r10, r8, r9                \n\t" /* r10 = (src, src_scale) */
+
+        /* dst, dst_scale */
+        "and    r8, r12, r7, lsr #8        \n\t" /* ag = r8 = r7 masked by r12 lsr by #8 */
+        "and    r9, r12, r7                \n\t" /* rb = r9 = r7 masked by r12 */
+        "mul    r8, r8, r6                 \n\t" /* ag = r8 times scale (r6) */
+        "mul    r9, r9, r6                 \n\t" /* rb = r9 times scale (r6) */
+        "and    r8, r8, r12, lsl #8        \n\t" /* ag masked by reverse mask (r12) */
+        "and    r9, r12, r9, lsr #8        \n\t" /* rb masked by mask (r12) */
+        "orr    r7, r8, r9                 \n\t" /* r7 = (dst, dst_scale) */
+
+        "add    r10, r7, r10               \n\t" /* *dst = src plus dst both scaled */
+
+        /* ----------------- */
+        "str    r10, [%[dst]], #4          \n\t" /* store r10, post-increment dst by one pixel (4 bytes) */
+        /* ----------------- */
+
+        "3:                                \n\t" /* <exit> */
+        : [dst] "+r" (dst), [src] "+r" (src), [count] "+r" (count), [alpha] "+r" (alpha)
+        :
+        : "cc", "r4", "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", "memory"
+        );
+
+}
+
+///////////////////////////////////////////////////////////////////////////////
+
+static const SkBlitRow::Proc sk_blitrow_platform_565_procs_arm[] = {
+    // no dither
+    // NOTE: For the functions below, we don't have a special version
+    // that assumes that each source pixel is opaque. But our S32A is
+    // still faster than the default, so use it.
+    S32A_D565_Opaque,   // S32_D565_Opaque
+    NULL,               // S32_D565_Blend
+    S32A_D565_Opaque,   // S32A_D565_Opaque
+    NULL,               // S32A_D565_Blend
+
+    // dither
+    NULL,               // S32_D565_Opaque_Dither
+    NULL,               // S32_D565_Blend_Dither
+    NULL,               // S32A_D565_Opaque_Dither
+    NULL,               // S32A_D565_Blend_Dither
+};
+
+static const SkBlitRow::Proc32 sk_blitrow_platform_32_procs_arm[] = {
+    NULL,                       // S32_Opaque,
+    NULL,                       // S32_Blend,
+    S32A_Opaque_BlitRow32_arm,  // S32A_Opaque,
+    S32A_Blend_BlitRow32_arm    // S32A_Blend
+};
+
+#endif // USE_ARM_CODE
+
+SkBlitRow::Proc SkBlitRow::PlatformProcs565(unsigned flags) {
+    return SK_ARM_NEON_WRAP(sk_blitrow_platform_565_procs_arm)[flags];
+}
+
+SkBlitRow::Proc32 SkBlitRow::PlatformProcs32(unsigned flags) {
+    return SK_ARM_NEON_WRAP(sk_blitrow_platform_32_procs_arm)[flags];
+}
+
+///////////////////////////////////////////////////////////////////////////////
+#define Color32_arm NULL
+SkBlitRow::ColorProc SkBlitRow::PlatformColorProc() {
+    return SK_ARM_NEON_WRAP(Color32_arm);
+}
+
+SkBlitRow::ColorRectProc PlatformColorRectProcFactory() {
+    return NULL;
+}
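
SK_ARM_NEON_WRAP comes from the included SkUtilsArm.h; it picks the plain-ARM symbol, its `_neon`-suffixed twin, or a runtime choice between the two. Roughly (paraphrased from memory of that header; check SkUtilsArm.h for the exact definitions):

    // Sketch of the dispatch macro, not a verbatim copy.
    #if SK_ARM_NEON_IS_NONE
        #define SK_ARM_NEON_WRAP(x)  (x)            // NEON never available
    #elif SK_ARM_NEON_IS_ALWAYS
        #define SK_ARM_NEON_WRAP(x)  (x ## _neon)   // NEON always available
    #else  // NEON detected at runtime
        #define SK_ARM_NEON_WRAP(x)  (sk_cpu_arm_has_neon() ? (x ## _neon) : (x))
    #endif

This is why the tables above only need ARM fallback entries when `!SK_ARM_NEON_IS_ALWAYS`: in a NEON-always build, the wrap resolves straight to the `_neon` tables declared in SkBlitRow_opts_arm_neon.h.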