/***************************************************************************
 * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 ***************************************************************************/

/***************************************************************************
  Neon memset: Attempts to do a memset with Neon registers if possible,
     Inputs:
        s: The buffer to write to
        c: The integer data to write to the buffer
        n: The size_t count.
     Outputs:

***************************************************************************/
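
/*
 * Per the AAPCS, the arguments above arrive as r0 = s, r1 = c and r2 = n.
 * r2 is treated as a count of 16-bit elements, and r0 is saved and
 * restored so the buffer pointer is handed back to the caller, memset-style.
 * The equivalent C prototype is presumably something like
 *     void *memset16_neon(uint16_t *s, uint16_t c, size_t n);
 * though the exact return type is not stated in this file.
 */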

        .code 32
        .fpu neon
        .align 4
        .globl memset16_neon
        .func

memset16_neon:
        cmp     r2, #0
        bxeq    lr

        /* Keep in mind that r2 -- the count argument -- is for the
         * number of 16-bit items to copy.
         */
        lsl     r2, r2, #1
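        /* r2 now holds the length in bytes. */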

        push    {r0}

        /* If we have 8 bytes or fewer, just do a quick loop to handle that */
        cmp     r2, #8
        bgt     memset_gt4
memset_smallcopy_loop:
        strh    r1, [r0], #2
        subs    r2, r2, #2
        bne     memset_smallcopy_loop
memset_smallcopy_done:
        pop     {r0}
        bx      lr

memset_gt4:
        /*
         * Duplicate the lowest 16 bits of r1 across all of r1. The idea is
         * to have a register holding two copies of the 16-bit value that we
         * can store at once. We do this by copying the lowest 16 bits of r1
         * into its upper 16 bits.
         */
        orr     r1, r1, r1, lsl #16
        /*
         * If we're copying 64 bytes or more, then we may want to get
         * onto a 16-byte boundary to improve speed even more.
         */
        cmp     r2, #64
        blt     memset_route
        ands    r12, r0, #0xf
        beq     memset_route
        /*
         * Determine the number of bytes to move forward to get to the
         * 16-byte boundary. This will be a multiple of 2 (and a multiple
         * of 4 when the destination happens to be word-aligned); the word
         * stores below handle the word-sized part and the trailing strcsh
         * handles an odd halfword.
         */
        rsb     r12, r12, #16
        sub     r2, r2, r12
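        /*
         * Shift the adjustment so that its bit 2 lands in the N flag and
         * bit 3 lands in the carry: strmi stores one word and the two
         * strcs store two more words. The second shift moves bit 1 into
         * the carry so strcsh can store the remaining halfword.
         */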
        lsls    r12, r12, #29
        strmi   r1, [r0], #4
        strcs   r1, [r0], #4
        strcs   r1, [r0], #4
        lsls    r12, r12, #2
        strcsh  r1, [r0], #2
memset_route:
        /*
         * Decide where to route for the maximum copy sizes. Note that we
         * build up q0 and q1 only if we'll need them, so that work is
         * interwoven here as well.
         */
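        /*
         * d0 holds the 32-bit pattern in both of its lanes (8 bytes of
         * data), q0 widens that to 16 bytes, and q0/q1 together provide
         * 32 bytes per store.
         */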
        vdup.u32 d0, r1
        cmp     r2, #16
        blt     memset_8
        vmov    d1, d0
        cmp     r2, #64
        blt     memset_16
        vmov    q1, q0
        cmp     r2, #128
        blt     memset_32
memset_128:
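        /* r12 = number of 128-byte blocks; each iteration stores 4 x 32 bytes. */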
        mov     r12, r2, lsr #7
memset_128_loop:
        vst1.64 {q0, q1}, [r0]!
        vst1.64 {q0, q1}, [r0]!
        vst1.64 {q0, q1}, [r0]!
        vst1.64 {q0, q1}, [r0]!
        subs    r12, r12, #1
        bne     memset_128_loop
        ands    r2, r2, #0x7f
        beq     memset_end
memset_32:
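        /* r12 = number of remaining 32-byte blocks. */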
        movs    r12, r2, lsr #5
        beq     memset_16
memset_32_loop:
        subs    r12, r12, #1
        vst1.64 {q0, q1}, [r0]!
        bne     memset_32_loop
        ands    r2, r2, #0x1f
        beq     memset_end
memset_16:
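        /* r12 = number of remaining 16-byte blocks. */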
        movs    r12, r2, lsr #4
        beq     memset_8
memset_16_loop:
        subs    r12, r12, #1
        vst1.32 {q0}, [r0]!
        bne     memset_16_loop
        ands    r2, r2, #0xf
        beq     memset_end
        /*
         * memset_8 isn't a loop, since we try to do our loops at 16
         * bytes and above. We should loop there, then drop down here
         * to finish the <16-byte cases. Same for memset_4 and
         * memset_2.
         */
memset_8:
        cmp     r2, #8
        blt     memset_4
        subs    r2, r2, #8
        vst1.32 {d0}, [r0]!
memset_4:
        cmp     r2, #4
        blt     memset_2
        subs    r2, r2, #4
        str     r1, [r0], #4
memset_2:
        cmp     r2, #0
        ble     memset_end
        strh    r1, [r0], #2
memset_end:
        pop     {r0}
        bx      lr

        .endfunc
        .end