michael@0: /***************************************************************************
michael@0:  * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
michael@0:  *
michael@0:  * Use of this source code is governed by a BSD-style license that can be
michael@0:  * found in the LICENSE file.
michael@0:  ***************************************************************************/
michael@0: 
michael@0: /***************************************************************************
michael@0:   memset16_neon: Fills a buffer with a 16-bit value, using Neon stores
michael@0:   for the bulk of the work whenever the buffer is large enough.
michael@0: 
michael@0:      Inputs (ARM state):
michael@0:         r0 = s: The buffer to write to.  Expected to be 2-byte aligned.
michael@0:         r1 = c: The value to write.  Only the low 16 bits are used.
michael@0:         r2 = n: The number of 16-bit items (NOT bytes) to write.
michael@0:                 n * 2 must not overflow 32 bits.
michael@0:      Outputs:
michael@0:         r0 = s, unchanged.
michael@0:      Clobbers:
michael@0:         r1, r2, r3, r12, d0-d3 (q0, q1) and the condition flags.
michael@0:      Stack usage:
michael@0:         None.
michael@0: ***************************************************************************/
michael@0: 
michael@0:         .code 32
michael@0:         .fpu neon
michael@0:         .align 4
michael@0:         .globl memset16_neon
michael@0:         .func
michael@0: 
michael@0: memset16_neon:
michael@0:         cmp             r2, #0
michael@0:         bxeq            lr
michael@0: 
michael@0:         /* Keep in mind that r2 -- the count argument -- is for the
michael@0:          * number of 16-bit items to write.  From here on r2 holds the
michael@0:          * number of BYTES that remain to be written.
michael@0:          */
michael@0:         lsl             r2, r2, #1
michael@0: 
michael@0:         /* r3 is the write cursor.  r0 is never modified, so it still
michael@0:          * holds s when we return and nothing has to be saved on the stack.
michael@0:          */
michael@0:         mov             r3, r0
michael@0: 
michael@0:         /* If we have <= 8 bytes, just do a quick loop to handle that.
michael@0:          * Byte counts are unsigned, hence bhi rather than bgt.
michael@0:          */
michael@0:         cmp             r2, #8
michael@0:         bhi             memset_gt4
michael@0: memset_smallcopy_loop:
michael@0:         strh            r1, [r3], #2
michael@0:         subs            r2, r2, #2
michael@0:         bne             memset_smallcopy_loop
michael@0: memset_smallcopy_done:
michael@0:         bx              lr
michael@0: 
michael@0: memset_gt4:
michael@0:         /*
michael@0:          * Duplicate the lowest 16 bits of r1 across r1, so that we have a
michael@0:          * register holding two copies of the 16-bit value.  The value is
michael@0:          * first moved to the upper half (discarding whatever was there)
michael@0:          * and then copied back down, so stale upper bits in r1 can never
michael@0:          * leak into the word and Neon stores below.
michael@0:          */
michael@0:         mov             r1, r1, lsl #16
michael@0:         orr             r1, r1, r1, lsr #16
michael@0:         /*
michael@0:          * If we're writing >= 64 bytes, then we may want to get
michael@0:          * onto a 16-byte boundary to improve speed even more.
michael@0:          */
michael@0:         cmp             r2, #64
michael@0:         blo             memset_route
michael@0:         ands            r12, r3, #0xf
michael@0:         beq             memset_route
michael@0:         /*
michael@0:          * r12 = number of bytes to move forward to get to the 16-byte
michael@0:          * boundary.  The buffer is only guaranteed to be halfword-aligned,
michael@0:          * so r12 is a multiple of 2, not necessarily of 4.  Peel the
michael@0:          * halfword (bit 1) first: after that the cursor is word-aligned
michael@0:          * and every word store below is naturally aligned.
michael@0:          */
michael@0:         rsb             r12, r12, #16
michael@0:         sub             r2, r2, r12
michael@0:         tst             r12, #2
michael@0:         strneh          r1, [r3], #2
michael@0:         /* lsls #29 puts bit 2 of r12 in N and bit 3 in C:
michael@0:          * mi => 4 more bytes, cs => 8 more bytes.
michael@0:          */
michael@0:         lsls            r12, r12, #29
michael@0:         strmi           r1, [r3], #4
michael@0:         strcs           r1, [r3], #4
michael@0:         strcs           r1, [r3], #4
michael@0: memset_route:
michael@0:         /*
michael@0:          * Decide where to route for the maximum store sizes.  Note that we
michael@0:          * build q0 and q1 depending on if we'll need it, so that's
michael@0:          * interwoven here as well.
michael@0:          */
michael@0:         vdup.u32        d0, r1
michael@0:         cmp             r2, #16
michael@0:         blo             memset_8
michael@0:         vmov            d1, d0
michael@0:         cmp             r2, #64
michael@0:         blo             memset_16
michael@0:         vmov            q1, q0
michael@0:         cmp             r2, #128
michael@0:         blo             memset_32
michael@0: memset_128:
michael@0:         /* r12 = number of whole 128-byte chunks (at least 1 here). */
michael@0:         mov             r12, r2, lsr #7
michael@0: memset_128_loop:
michael@0:         vst1.64         {q0, q1}, [r3]!
michael@0:         vst1.64         {q0, q1}, [r3]!
michael@0:         vst1.64         {q0, q1}, [r3]!
michael@0:         vst1.64         {q0, q1}, [r3]!
michael@0:         subs            r12, r12, #1
michael@0:         bne             memset_128_loop
michael@0:         ands            r2, r2, #0x7f
michael@0:         beq             memset_end
michael@0: memset_32:
michael@0:         /* r12 = number of whole 32-byte chunks; may be zero. */
michael@0:         movs            r12, r2, lsr #5
michael@0:         beq             memset_16
michael@0: memset_32_loop:
michael@0:         subs            r12, r12, #1
michael@0:         vst1.64         {q0, q1}, [r3]!
michael@0:         bne             memset_32_loop
michael@0:         ands            r2, r2, #0x1f
michael@0:         beq             memset_end
michael@0: memset_16:
michael@0:         /* r12 = number of whole 16-byte chunks; may be zero. */
michael@0:         movs            r12, r2, lsr #4
michael@0:         beq             memset_8
michael@0: memset_16_loop:
michael@0:         subs            r12, r12, #1
michael@0:         vst1.32         {q0}, [r3]!
michael@0:         bne             memset_16_loop
michael@0:         ands            r2, r2, #0xf
michael@0:         beq             memset_end
michael@0:         /*
michael@0:          * memset_8 isn't a loop, since we try to do our loops at 16
michael@0:          * bytes and above.  We should loop there, then drop down here
michael@0:          * to finish the <16-byte versions.  Same for memset_4 and
michael@0:          * memset_2.
michael@0:          */
michael@0: memset_8:
michael@0:         cmp             r2, #8
michael@0:         blo             memset_4
michael@0:         subs            r2, r2, #8
michael@0:         vst1.32         {d0}, [r3]!
michael@0: memset_4:
michael@0:         cmp             r2, #4
michael@0:         blo             memset_2
michael@0:         subs            r2, r2, #4
michael@0:         str             r1, [r3], #4
michael@0: memset_2:
michael@0:         /* At most one halfword is left at this point. */
michael@0:         cmp             r2, #0
michael@0:         ble             memset_end
michael@0:         strh            r1, [r3], #2
michael@0: memset_end:
michael@0:         bx              lr
michael@0: 
michael@0:         .endfunc
michael@0:         .end