/***************************************************************************
 * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 ***************************************************************************/

/***************************************************************************
  Neon memset16: fills a buffer with a repeated 16-bit value, using Neon
  registers for the bulk of the work when the buffer is large enough.

     Inputs:
        s (r0): The buffer to write to.  Assumed to be at least 2-byte
                aligned (bit 0 of the address is ignored when computing
                the distance to the next 16-byte boundary).
        c (r1): The data to write; only the low 16 bits are used.
        n (r2): The count of 16-bit items (NOT bytes) to write.  A count
                that is zero, or negative when viewed as a signed int,
                writes nothing.
     Outputs:
        None.  r0 is restored to s before returning.
     Clobbers:
        r1, r2, r12, q0, q1 (d0-d3), condition flags.
     Stack:
        4 bytes (saved r0), released before return.
***************************************************************************/

    .code 32
    .fpu neon
    .align 4
    .globl memset16_neon
#if defined(__ELF__)
    /* Mark the symbol as a function so ELF linkers can apply ARM/Thumb
     * interworking to calls that target it.
     */
    .type memset16_neon, %function
#endif
    .func

memset16_neon:
    /*
     * Nothing to do for an empty request.  Counts that are negative as a
     * signed int are rejected too: as an unsigned count they would need a
     * buffer of at least 4 GiB, which cannot exist in a 32-bit address
     * space, so this only filters out invalid calls.
     */
    cmp         r2, #0
    bxle        lr

    /* Keep in mind that r2 -- the count argument -- is for the
     * number of 16-bit items to copy.  From here on r2 is a byte count.
     */
    lsl         r2, r2, #1

    push        {r0}

    /* If we have <= 8 bytes (at most four items), just do a quick
     * halfword loop to handle that.
     */
    cmp         r2, #8
    bgt         memset_gt4
memset_smallcopy_loop:
    strh        r1, [r0], #2
    subs        r2, r2, #2
    bne         memset_smallcopy_loop
memset_smallcopy_done:
    pop         {r0}
    bx          lr

memset_gt4:
    /*
     * Duplicate the lowest 16 bits of r1 across r1, so that we have a
     * register holding two copies of the 16-bit value.  The shift-left
     * discards whatever was in bits 31:16 first, so the result is a clean
     * pattern even if the caller passed a value wider than 16 bits.
     */
    mov         r1, r1, lsl #16
    orr         r1, r1, r1, lsr #16
    /*
     * If we're copying >= 64 bytes, then we may want to get
     * onto a 16-byte boundary to improve speed even more.
     */
    cmp         r2, #64
    blt         memset_route
    ands        r12, r0, #0xf
    beq         memset_route
    /*
     * r12 = number of bytes to move forward to reach the 16-byte boundary
     * (an even value in 2..14 for a 2-byte-aligned pointer).  Peel it off
     * smallest piece first -- halfword, then word, then two words -- so
     * that each store lands on an address aligned to its own width.  A
     * 16-bit buffer is not guaranteed to start word-aligned, so the word
     * stores must not come first.
     */
    rsb         r12, r12, #16
    sub         r2, r2, r12
    tst         r12, #2
    strneh      r1, [r0], #2
    tst         r12, #4
    strne       r1, [r0], #4
    tst         r12, #8
    strne       r1, [r0], #4
    strne       r1, [r0], #4
memset_route:
    /*
     * Decide where to route for the maximum copy sizes.  Note that we
     * build q0 and q1 depending on if we'll need it, so that's
     * interwoven here as well.
     */
    vdup.u32    d0, r1
    cmp         r2, #16
    blt         memset_8
    vmov        d1, d0
    cmp         r2, #64
    blt         memset_16
    vmov        q1, q0
    cmp         r2, #128
    blt         memset_32
memset_128:
    /* r12 = number of whole 128-byte chunks; four 32-byte stores each. */
    mov         r12, r2, lsr #7
memset_128_loop:
    vst1.64     {q0, q1}, [r0]!
    vst1.64     {q0, q1}, [r0]!
    vst1.64     {q0, q1}, [r0]!
    vst1.64     {q0, q1}, [r0]!
    subs        r12, r12, #1
    bne         memset_128_loop
    ands        r2, r2, #0x7f
    beq         memset_end
memset_32:
    /* r12 = number of whole 32-byte chunks left. */
    movs        r12, r2, lsr #5
    beq         memset_16
memset_32_loop:
    subs        r12, r12, #1
    vst1.64     {q0, q1}, [r0]!
    bne         memset_32_loop
    ands        r2, r2, #0x1f
    beq         memset_end
memset_16:
    /* r12 = number of whole 16-byte chunks left. */
    movs        r12, r2, lsr #4
    beq         memset_8
memset_16_loop:
    subs        r12, r12, #1
    vst1.32     {q0}, [r0]!
    bne         memset_16_loop
    ands        r2, r2, #0xf
    beq         memset_end
    /*
     * memset_8 isn't a loop, since we try to do our loops at 16
     * bytes and above.  We should loop there, then drop down here
     * to finish the <16-byte versions.  Same for memset_4 and
     * memset_2.
     */
memset_8:
    cmp         r2, #8
    blt         memset_4
    subs        r2, r2, #8
    vst1.32     {d0}, [r0]!
memset_4:
    cmp         r2, #4
    blt         memset_2
    subs        r2, r2, #4
    str         r1, [r0], #4
memset_2:
    /* r2 is 0 or 2 here: at most one trailing halfword remains. */
    cmp         r2, #0
    ble         memset_end
    strh        r1, [r0], #2
memset_end:
    pop         {r0}
    bx          lr

    .endfunc
    .end