/***************************************************************************
 * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 ***************************************************************************/

/***************************************************************************
  memset16_neon: Fills a buffer with a 16-bit value. Requests of 8 bytes or
  fewer are handled by a halfword store loop; larger requests use NEON
  stores for the bulk and scalar stores for the head and tail.

     Syntax: GNU as, ARM (A32) state, divided (pre-UAL) mnemonics.

     Inputs:
        r0 (s): The buffer to write to.
                NOTE(review): assumed to be at least halfword-aligned --
                confirm against callers.
        r1 (c): The data to write to the buffer. Only the low 16 bits are
                used; the upper 16 bits are ignored.
        r2 (n): The count of 16-bit items (not bytes) to write.
     Outputs:
        r0: restored to the original buffer pointer before returning.
     Clobbers:
        r1, r2, r12, q0, q1 (d0-d3), condition flags.
***************************************************************************/

        .code 32
        .fpu neon
        .align 4
        .globl memset16_neon
        .func

memset16_neon:
        /* Nothing to do for a zero count. */
        cmp             r2, #0
        bxeq            lr

        /*
         * r2 -- the count argument -- is the number of 16-bit items to
         * write. Convert it to a byte count; from here on r2 is always
         * "bytes remaining" and is always even.
         */
        lsl             r2, r2, #1

        /* r0 is advanced by the stores below; keep the original pointer. */
        push            {r0}

        /*
         * If we have 8 bytes or fewer, just do a quick halfword loop.
         * The byte count is unsigned, hence the unsigned condition.
         */
        cmp             r2, #8
        bhi             memset_gt4
memset_smallcopy_loop:
        strh            r1, [r0], #2
        subs            r2, r2, #2
        bne             memset_smallcopy_loop
memset_smallcopy_done:
        pop             {r0}
        bx              lr

memset_gt4:
        /*
         * Duplicate the lowest 16 bits of r1 into its upper 16 bits so
         * that one register holds two copies of the value. The left
         * shift discards whatever the caller left in the upper half, so
         * the word/NEON paths write exactly what the strh path writes.
         */
        mov             r1, r1, lsl #16
        orr             r1, r1, r1, lsr #16
        /*
         * If we're writing 64 bytes or more, first get onto a 16-byte
         * boundary to improve speed even more.
         */
        cmp             r2, #64
        blo             memset_route
        ands            r12, r0, #0xf
        beq             memset_route
        /*
         * r12 = number of bytes to the next 16-byte boundary. For a
         * halfword-aligned buffer this is an even value in 2..14. Peel
         * it off smallest piece first (2, then 4, then 8 bytes) so that
         * each store is naturally aligned: after the optional halfword
         * the pointer is word-aligned, and after the optional word it is
         * 8-byte aligned.
         */
        rsb             r12, r12, #16
        sub             r2, r2, r12
        tst             r12, #2
        strneh          r1, [r0], #2
        tst             r12, #4
        strne           r1, [r0], #4
        tst             r12, #8
        strne           r1, [r0], #4
        strne           r1, [r0], #4
memset_route:
        /*
         * Decide where to route for the maximum store size. Note that we
         * build q0 and q1 only as far as the chosen path needs them, so
         * that is interwoven with the size checks here.
         */
        vdup.u32        d0, r1
        cmp             r2, #16
        blo             memset_8
        vmov            d1, d0
        cmp             r2, #64
        blo             memset_16
        vmov            q1, q0
        cmp             r2, #128
        blo             memset_32
memset_128:
        /* r12 = number of whole 128-byte chunks (non-zero here). */
        mov             r12, r2, lsr #7
memset_128_loop:
        /* 4 stores x 32 bytes = 128 bytes per iteration. */
        vst1.64         {q0, q1}, [r0]!
        vst1.64         {q0, q1}, [r0]!
        vst1.64         {q0, q1}, [r0]!
        vst1.64         {q0, q1}, [r0]!
        subs            r12, r12, #1
        bne             memset_128_loop
        ands            r2, r2, #0x7f
        beq             memset_end
memset_32:
        /* r12 = number of whole 32-byte chunks; may be zero. */
        movs            r12, r2, lsr #5
        beq             memset_16
memset_32_loop:
        /* vst1 does not touch the flags, so the subs result drives bne. */
        subs            r12, r12, #1
        vst1.64         {q0, q1}, [r0]!
        bne             memset_32_loop
        ands            r2, r2, #0x1f
        beq             memset_end
memset_16:
        /* r12 = number of whole 16-byte chunks; may be zero. */
        movs            r12, r2, lsr #4
        beq             memset_8
memset_16_loop:
        subs            r12, r12, #1
        vst1.32         {q0}, [r0]!
        bne             memset_16_loop
        ands            r2, r2, #0xf
        beq             memset_end
        /*
         * memset_8 isn't a loop, since we do our loops at 16 bytes and
         * above. We loop there, then drop down here to finish the
         * remaining <16 bytes. Same for memset_4 and memset_2.
         */
memset_8:
        cmp             r2, #8
        blo             memset_4
        subs            r2, r2, #8
        vst1.32         {d0}, [r0]!
memset_4:
        cmp             r2, #4
        blo             memset_2
        subs            r2, r2, #4
        str             r1, [r0], #4
memset_2:
        /* At most one halfword can be left at this point. */
        cmp             r2, #0
        beq             memset_end
        strh            r1, [r0], #2
memset_end:
        pop             {r0}
        bx              lr

        .endfunc
        .end