/*
 * Copyright 2010 The Android Open Source Project
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 */

/* Changes:
 * 2010-08-11 Steve McIntyre <steve.mcintyre@arm.com>
 *    Added small changes to the two functions to make them work on the
 *    specified number of 16- or 32-bit values, rather than on a count of
 *    bytes as the original code did. Added more verbose comments to aid
 *    future maintenance.
 */

/* Code section, 16-byte (2^4) alignment, unified ARM/Thumb syntax. */
        .text
        .p2align    4
        .syntax     unified

/* Exported entry points, declared in the order they are defined below. */
        .globl      arm_memset16
        .type       arm_memset16, %function
        .globl      arm_memset32
        .type       arm_memset32, %function

/*
 * Optimized memset functions for ARM.
 *
 * void arm_memset16(uint16_t* dst, uint16_t value, int count);
 * void arm_memset32(uint32_t* dst, uint32_t value, int count);
 *
 * In both functions count is the number of elements (16- or 32-bit values)
 * to store, not a number of bytes.
 */
/*
 * void arm_memset16(uint16_t* dst, uint16_t value, int count);
 *
 * Stores `count` copies of the 16-bit `value` starting at dst.
 *
 * In:       r0 = dst, r1 = value, r2 = count (in 16-bit elements)
 * Clobbers: r0-r3, ip, lr, flags
 *
 * Only bit 1 of dst is tested when aligning, so dst is assumed to be at
 * least 2-byte aligned.  The value is widened with a single orr, which
 * assumes bits 16-31 of r1 are zero on entry.
 *
 * After its own setup this function continues in arm_memset32 at
 * .Lwork_32.  The early-out path uses arm_memset32's .Lfinish, which pops
 * the lr pushed here.
 */
arm_memset16:
        .fnstart
        .save       {lr}                /* record the push in the unwind tables */
        push        {lr}

        /* Nothing to do when count <= 0.  cmp is required here rather than
         * teq: ble tests the V flag, which teq never writes, so with teq the
         * branch would depend on whatever V the caller happened to leave. */
        cmp         r2, #0
        ble         .Lfinish

        /* r2 = count * 2: convert the halfword count into a byte count. */
        mov         r2, r2, lsl #1

        /* Replicate the 16-bit value into both halves of r1. */
        orr         r1, r1, r1, lsl #16

        /* If dst is not 32-bit aligned, store one halfword to align it and
         * account for the two bytes just written. */
        tst         r0, #2
        strhne      r1, [r0], #2
        subne       r2, r2, #2

        /* Continue in the shared body: r0 = dst, r1 = pattern, r2 = bytes. */
        b           .Lwork_32
        .fnend
        .size       arm_memset16, . - arm_memset16

/*
 * void arm_memset32(uint32_t* dst, uint32_t value, int count);
 *
 * Stores `count` copies of the 32-bit `value` starting at dst.
 *
 * In:       r0 = dst, r1 = value, r2 = count (in 32-bit words)
 * Clobbers: r0-r3, ip, lr, flags
 *
 * The alignment code only looks at bits 2-4 of dst, so dst is assumed to be
 * 4-byte aligned.
 *
 * arm_memset16 branches to .Lwork_32 (with lr already pushed, r1 holding the
 * value replicated into both halves and r2 holding a byte count) and to
 * .Lfinish, so both labels must keep their names and entry conditions.
 */
arm_memset32:
        .fnstart
        .save       {lr}                /* record the push in the unwind tables */
        push        {lr}

        /* Nothing to do when count <= 0.  cmp is required here rather than
         * teq: ble tests the V flag, which teq never writes, so with teq the
         * branch would depend on whatever V the caller happened to leave. */
        cmp         r2, #0
        ble         .Lfinish

        /* r2 = count * 4: convert the word count into a byte count. */
        mov         r2, r2, lsl #2

.Lwork_32:
        /* Here: r0 = dst, r1 = 32-bit fill pattern, r2 = bytes remaining.
         * Copy the pattern into ip and lr for the multi-register stores. */
        mov         ip, r1
        mov         lr, r1

        /* Try to align the destination to a cache line, assuming 32-byte
         * (8-word) lines.  r3 = (-dst) & 0x1C is the number of bytes up to
         * the next 32-byte boundary.  If that is more than the bytes left,
         * only the whole words that remain are written here. */
        rsb         r3, r0, #0
        ands        r3, r3, #0x1C
        beq         .Laligned32
        cmp         r3, r2
        andhi       r3, r2, #0x1C
        sub         r2, r2, r3

        /* Store the r3 leading bytes (0-28, a multiple of 4).  Shifting left
         * by 28 moves bit 4 (16 bytes) into C and bit 3 (8 bytes) into N;
         * the further shift by 2 moves bit 2 (4 bytes) into C.  The stores
         * themselves leave the flags untouched. */
        movs        r3, r3, lsl #28
        stmiacs     r0!, {r1, lr}
        stmiacs     r0!, {r1, lr}
        stmiami     r0!, {r1, lr}
        movs        r3, r3, lsl #2
        strcs       r1, [r0], #4

        /* Main loop: 32 bytes per iteration while at least 32 bytes remain. */
.Laligned32:
        mov         r3, r1
1:      subs        r2, r2, #32
        stmiahs     r0!, {r1, r3, ip, lr}
        stmiahs     r0!, {r1, r3, ip, lr}
        bhs         1b
        add         r2, r2, #32         /* undo the final, failed subtraction */

        /* Store the r2 trailing bytes (0-30).  Same flag trick as above:
         * bit 4 -> C (16 bytes), bit 3 -> N (8 bytes), then bit 2 -> C
         * (4 bytes) and bit 1 -> N (2 bytes, only on the arm_memset16 path). */
        movs        r2, r2, lsl #28
        stmiacs     r0!, {r1, r3, ip, lr}
        stmiami     r0!, {r1, lr}
        movs        r2, r2, lsl #2
        strcs       r1, [r0], #4
        strhmi      lr, [r0], #2

.Lfinish:
        /* Return by popping the saved lr straight into pc. */
        pop         {pc}
        .fnend
        .size       arm_memset32, . - arm_memset32