/*
 * Copyright 2010 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

/* Changes:
 * 2010-08-11 Steve McIntyre
 *    Added small changes to the two functions to make them work on the
 *    specified number of 16- or 32-bit values rather than the original
 *    code which was specified as a count of bytes. More verbose comments
 *    to aid future maintenance.
 */

        .text
        .align 4
        .syntax unified

        .global arm_memset32
        .type   arm_memset32, %function
        .global arm_memset16
        .type   arm_memset16, %function

/*
 * Optimized memset functions for ARM.
 *
 * void arm_memset16(uint16_t* dst, uint16_t value, int count);
 * void arm_memset32(uint32_t* dst, uint32_t value, int count);
 *
 */
arm_memset16:
        .fnstart
        push        {lr}

        /* if count is equal to zero then abort */
        teq         r2, #0
        ble         .Lfinish

        /* Multiply count by 2 - go from the number of 16-bit shorts
         * to the number of bytes desired. */
        mov         r2, r2, lsl #1

        /* expand the data to 32 bits */
        orr         r1, r1, r1, lsl #16

        /* align to 32 bits */
        tst         r0, #2
        strhne      r1, [r0], #2
        subne       r2, r2, #2

        /* Now jump into the main loop below. */
        b           .Lwork_32
        .fnend

arm_memset32:
        .fnstart
        push        {lr}

        /* if count is equal to zero then abort */
        teq         r2, #0
        ble         .Lfinish

        /* Multiply count by 4 - go from the number of 32-bit words to
         * the number of bytes desired. */
        mov         r2, r2, lsl #2

.Lwork_32:
        /* Set up registers ready for writing them out. */
        mov         ip, r1
        mov         lr, r1

        /* Try to align the destination to a cache line. Assume 32
         * byte (8 word) cache lines, it's the common case. */
        rsb         r3, r0, #0
        ands        r3, r3, #0x1C
        beq         .Laligned32
        cmp         r3, r2
        andhi       r3, r2, #0x1C
        sub         r2, r2, r3

        /* (Optionally) write any unaligned leading bytes.
         * (0-28 bytes, length in r3) */
        movs        r3, r3, lsl #28
        stmiacs     r0!, {r1, lr}
        stmiacs     r0!, {r1, lr}
        stmiami     r0!, {r1, lr}
        movs        r3, r3, lsl #2
        strcs       r1, [r0], #4

        /* Now quickly loop through the cache-aligned data. */
.Laligned32:
        mov         r3, r1
1:      subs        r2, r2, #32
        stmiahs     r0!, {r1,r3,ip,lr}
        stmiahs     r0!, {r1,r3,ip,lr}
        bhs         1b
        add         r2, r2, #32

        /* (Optionally) store any remaining trailing bytes.
         * (0-30 bytes, length in r2) */
        movs        r2, r2, lsl #28
        stmiacs     r0!, {r1,r3,ip,lr}
        stmiami     r0!, {r1,lr}
        movs        r2, r2, lsl #2
        strcs       r1, [r0], #4
        strhmi      lr, [r0], #2

.Lfinish:
        pop         {pc}
        .fnend
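
/*
 * Example usage from C (a minimal sketch, not part of the original
 * source; the buffer names, sizes, and fill values below are
 * hypothetical, chosen only to illustrate the prototypes above):
 *
 *   #include <stdint.h>
 *
 *   void arm_memset16(uint16_t* dst, uint16_t value, int count);
 *   void arm_memset32(uint32_t* dst, uint32_t value, int count);
 *
 *   int main(void) {
 *       uint16_t pixels[320];
 *       uint32_t words[64];
 *
 *       arm_memset16(pixels, 0xF800, 320);   // fill 320 shorts (RGB565 red)
 *       arm_memset32(words, 0xDEADBEEF, 64); // fill 64 words
 *       return 0;
 *   }
 *
 * Note that the count arguments are element counts, not byte counts,
 * and that arm_memset16 only fixes up a 2-byte misalignment, so the
 * destination pointer must itself be at least 2-byte aligned.
 */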