/*
 * Copyright 2010 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

/* Changes:
 * 2010-08-11 Steve McIntyre
 *    Added small changes to the two functions to make them work on the
 *    specified number of 16- or 32-bit values rather than the original
 *    code which was specified as a count of bytes. More verbose comments
 *    to aid future maintenance.
 */

        .text
        .align 4
        .syntax unified

        .global arm_memset32
        .type   arm_memset32, %function
        .global arm_memset16
        .type   arm_memset16, %function

/*
 * Optimized memset functions for ARM.
 *
 * void arm_memset16(uint16_t* dst, uint16_t value, int count);
 * void arm_memset32(uint32_t* dst, uint32_t value, int count);
 *
 */
arm_memset16:
        .fnstart
        push        {lr}

        /* if count is equal to zero then abort */
        teq         r2, #0
        ble         .Lfinish

        /* Multiply count by 2 - go from the number of 16-bit shorts
         * to the number of bytes desired. */
        mov         r2, r2, lsl #1

        /* expand the data to 32 bits */
        orr         r1, r1, r1, lsl #16

        /* align to 32 bits */
        tst         r0, #2
        strhne      r1, [r0], #2
        subne       r2, r2, #2

        /* Now jump into the main loop below. */
        b           .Lwork_32
        .fnend

arm_memset32:
        .fnstart
        push        {lr}

        /* if count is equal to zero then abort */
        teq         r2, #0
        ble         .Lfinish

        /* Multiply count by 4 - go from the number of 32-bit words to
         * the number of bytes desired. */
        mov         r2, r2, lsl #2

.Lwork_32:
        /* Set up registers ready for writing them out. */
        mov         ip, r1
        mov         lr, r1

        /* Try to align the destination to a cache line. Assume 32
         * byte (8 word) cache lines, it's the common case. */
        rsb         r3, r0, #0
        ands        r3, r3, #0x1C
        beq         .Laligned32
        cmp         r3, r2
        andhi       r3, r2, #0x1C
        sub         r2, r2, r3

        /* (Optionally) write any unaligned leading bytes.
         * (0-28 bytes, length in r3) */
        movs        r3, r3, lsl #28
        stmiacs     r0!, {r1, lr}
        stmiacs     r0!, {r1, lr}
        stmiami     r0!, {r1, lr}
        movs        r3, r3, lsl #2
        strcs       r1, [r0], #4

        /* Now quickly loop through the cache-aligned data. */
.Laligned32:
        mov         r3, r1
1:      subs        r2, r2, #32
        stmiahs     r0!, {r1,r3,ip,lr}
        stmiahs     r0!, {r1,r3,ip,lr}
        bhs         1b
        add         r2, r2, #32

        /* (Optionally) store any remaining trailing bytes.
         * (0-30 bytes, length in r2) */
        movs        r2, r2, lsl #28
        stmiacs     r0!, {r1,r3,ip,lr}
        stmiami     r0!, {r1,lr}
        movs        r2, r2, lsl #2
        strcs       r1, [r0], #4
        strhmi      lr, [r0], #2

.Lfinish:
        pop         {pc}
        .fnend
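
/*
 * Example usage from C (a minimal sketch, not part of the original
 * source; the buffer names, sizes, and fill values below are
 * hypothetical, chosen only to illustrate the prototypes above):
 *
 *   #include <stdint.h>
 *
 *   void arm_memset16(uint16_t* dst, uint16_t value, int count);
 *   void arm_memset32(uint32_t* dst, uint32_t value, int count);
 *
 *   int main(void) {
 *       uint16_t pixels[320];
 *       uint32_t words[64];
 *
 *       arm_memset16(pixels, 0xF800, 320);   // fill 320 shorts (RGB565 red)
 *       arm_memset32(words, 0xDEADBEEF, 64); // fill 64 words
 *       return 0;
 *   }
 *
 * Note that the count arguments are element counts, not byte counts,
 * and that arm_memset16 only fixes up a 2-byte misalignment, so the
 * destination pointer must itself be at least 2-byte aligned.
 */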