/*
 * Copyright 2010 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

/* Changes:
 * 2010-08-11 Steve McIntyre <steve.mcintyre@arm.com>
 *    Added small changes to the two functions to make them work on the
 *    specified number of 16- or 32-bit values rather than the original
 *    code which was specified as a count of bytes. More verbose comments
 *    to aid future maintenance.
 */

        .text
        .align  4                       /* .align is power-of-two here: 16-byte alignment */
        .syntax unified

        .global arm_memset32
        .type   arm_memset32, %function
        .global arm_memset16
        .type   arm_memset16, %function

/*
 * Optimized memset functions for ARM.
 *
 * void arm_memset16(uint16_t* dst, uint16_t value, int count);
 * void arm_memset32(uint32_t* dst, uint32_t value, int count);
 *
 * In:    r0 = dst (assumed 2-byte aligned for memset16, 4-byte aligned
 *             for memset32 -- TODO confirm against callers)
 *        r1 = value to store (memset16 uses only the low 16 bits)
 *        r2 = count of 16-/32-bit elements; count <= 0 stores nothing
 * Out:   none
 * Clobb: r0-r3, ip, lr, flags (AAPCS caller-saved only)
 */
arm_memset16:
        .fnstart
        push    {lr}

        /* If count <= 0 there is nothing to do.
         * NOTE(fix): this was "teq r2, #0". TEQ is EOR-based and does
         * NOT update the V flag, while the signed BLE below tests
         * Z==1 || N!=V. Flags are undefined at function entry under
         * the AAPCS, so a stale V=1 from the caller made BLE skip the
         * whole memset for a positive count. CMP sets N, Z, C and V
         * from r2 - 0, so BLE reliably means "count <= 0". */
        cmp     r2, #0
        ble     .Lfinish

        /* Multiply count by 2 - go from the number of 16-bit shorts
         * to the number of bytes desired. */
        mov     r2, r2, lsl #1

        /* Expand the 16-bit value to fill all 32 bits, so that word
         * and multi-word stores write the pattern. */
        orr     r1, r1, r1, lsl #16

        /* Align the destination to 32 bits: if dst is only 2-byte
         * aligned, emit one leading halfword (count >= 1 element is
         * guaranteed by the check above). */
        tst     r0, #2
        strhne  r1, [r0], #2
        subne   r2, r2, #2

        /* Now jump into the main loop below (shared with memset32;
         * from here on r2 is a byte count). */
        b       .Lwork_32
        .fnend

arm_memset32:
        .fnstart
        push    {lr}

        /* If count <= 0 there is nothing to do (CMP, not TEQ, for the
         * same stale-V-flag reason documented in arm_memset16). */
        cmp     r2, #0
        ble     .Lfinish

        /* Multiply count by 4 - go from the number of 32-bit words to
         * the number of bytes desired. */
        mov     r2, r2, lsl #2

.Lwork_32:
        /* Replicate the pattern into ip and lr so {r1, r3, ip, lr}
         * can store 16 bytes per STM (r3 is loaded at .Laligned32;
         * until then it is needed as scratch). */
        mov     ip, r1
        mov     lr, r1

        /* Try to align the destination to a cache line. Assume 32
         * byte (8 word) cache lines, it's the common case.
         * r3 = bytes (0-28, a multiple of 4) from dst up to the next
         * 32-byte boundary, clamped so it never exceeds the total
         * length r2. */
        rsb     r3, r0, #0
        ands    r3, r3, #0x1C
        beq     .Laligned32
        cmp     r3, r2
        andhi   r3, r2, #0x1C           /* r3 > r2: store only whole words of r2 */
        sub     r2, r2, r3

        /* (Optionally) write any unaligned leading bytes.
         * (0-28 bytes, length in r3)
         * Flag trick: lsl #28 moves length bit 4 into C and bit 3
         * into N, so each store is predicated on one binary digit of
         * the length: CS -> 16 bytes pending, MI -> 8 bytes, and after
         * shifting two more places, CS -> 4 bytes. */
        movs    r3, r3, lsl #28
        stmiacs r0!, {r1, lr}
        stmiacs r0!, {r1, lr}
        stmiami r0!, {r1, lr}
        movs    r3, r3, lsl #2
        strcs   r1, [r0], #4

        /* Now quickly loop through the cache-aligned data: 32 bytes
         * (one assumed cache line) per iteration via two 16-byte STMs. */
.Laligned32:
        mov     r3, r1
1:      subs    r2, r2, #32
        stmiahs r0!, {r1, r3, ip, lr}
        stmiahs r0!, {r1, r3, ip, lr}
        bhs     1b
        add     r2, r2, #32             /* undo the final over-subtraction */

        /* (Optionally) store any remaining trailing bytes.
         * (0-30 bytes, length in r2)
         * Same flag trick as the leading stores; the final MI halfword
         * case (odd multiple of 2) can only arise via arm_memset16. */
        movs    r2, r2, lsl #28
        stmiacs r0!, {r1, r3, ip, lr}
        stmiami r0!, {r1, lr}
        movs    r2, r2, lsl #2
        strcs   r1, [r0], #4
        strhmi  lr, [r0], #2

.Lfinish:
        pop     {pc}
        .fnend