|
1 /* |
|
2 * Copyright (c) 2012 |
|
3 * MIPS Technologies, Inc., California. |
|
4 * |
|
5 * Redistribution and use in source and binary forms, with or without |
|
6 * modification, are permitted provided that the following conditions |
|
7 * are met: |
|
8 * 1. Redistributions of source code must retain the above copyright |
|
9 * notice, this list of conditions and the following disclaimer. |
|
10 * 2. Redistributions in binary form must reproduce the above copyright |
|
11 * notice, this list of conditions and the following disclaimer in the |
|
12 * documentation and/or other materials provided with the distribution. |
|
13 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its |
|
14 * contributors may be used to endorse or promote products derived from |
|
15 * this software without specific prior written permission. |
|
16 * |
|
17 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND |
|
18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
|
19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
|
20 * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE |
|
21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL |
|
22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS |
|
23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) |
|
24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT |
|
25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY |
|
26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
|
27 * SUCH DAMAGE. |
|
28 */ |
|
29 |
|
30 #include "pixman-mips-dspr2-asm.h" |
|
31 |
|
32 /* |
|
33 * This routine could be optimized for MIPS64. The current code only |
|
34 * uses MIPS32 instructions. |
|
35 */ |
|
36 |
|
37 #ifdef EB |
|
38 # define LWHI lwl /* high part is left in big-endian */ |
|
39 # define SWHI swl /* high part is left in big-endian */ |
|
40 # define LWLO lwr /* low part is right in big-endian */ |
|
41 # define SWLO swr /* low part is right in big-endian */ |
|
42 #else |
|
43 # define LWHI lwr /* high part is right in little-endian */ |
|
44 # define SWHI swr /* high part is right in little-endian */ |
|
45 # define LWLO lwl /* low part is left in little-endian */ |
|
46 # define SWLO swl /* low part is left in little-endian */ |
|
47 #endif |
|
48 |
|
/*
 * pixman_mips_fast_memcpy
 *
 * Copies a2 bytes from the address in a1 to the address in a0 and returns
 * the original destination pointer in v0.
 *
 * In:       a0 = dst, a1 = src, a2 = byte count
 * Out:      v0 = original dst
 * Clobbers: AT, v1, a0-a3, t0-t9
 *
 * Structure:
 *   - fewer than 8 bytes: plain byte loop ($last8)
 *   - src and dst share the same alignment within a word: "aligned" path
 *     (lw/sw) after copying up to 3 head bytes
 *   - otherwise: "unaligned" path, dst is made word-aligned and the source
 *     words are assembled with the LWHI/LWLO (lwl/lwr) pair
 *   Both paths copy 64-byte chunks, then at most one 32-byte chunk, then
 *   whole words, then the trailing bytes.
 *
 * NOTE: every branch/jump below is followed by an explicit delay-slot
 * instruction (indented by one extra space). This presumes the
 * LEAF_MIPS32R2 macro puts the assembler in ".set noreorder" mode --
 * TODO confirm against pixman-mips-dspr2-asm.h.
 */
LEAF_MIPS32R2(pixman_mips_fast_memcpy)

    slti    AT, a2, 8
    bne     AT, zero, $last8        /* short copy: go straight to the byte loop */
     move   v0, a0                  /* memcpy returns the dst pointer */

    /* Test if the src and dst are word-aligned, or can be made word-aligned */
    xor     t8, a1, a0
    andi    t8, t8, 0x3             /* t8 is a0/a1 word-displacement */

    bne     t8, zero, $unaligned
     negu   a3, a0                  /* also consumed by $unaligned */

    andi    a3, a3, 0x3             /* we need to copy a3 bytes to make a0/a1 aligned */
    beq     a3, zero, $chk16w       /* when a3=0 then the dst (a0) is word-aligned */
     subu   a2, a2, a3              /* now a2 is the remaining byte count */

    /* Copy the a3 head bytes with a single partial-word load/store */
    LWHI    t8, 0(a1)
    addu    a1, a1, a3
    SWHI    t8, 0(a0)
    addu    a0, a0, a3

    /* Now the dst/src are mutually word-aligned with word-aligned addresses */
$chk16w:
    andi    t8, a2, 0x3f            /* any whole 64-byte chunks? */
                                    /* t8 is the byte count after 64-byte chunks */

    beq     a2, t8, $chk8w          /* if a2==t8, no 64-byte chunks */
                                    /* There will be at most 1 32-byte chunk after it */
     subu   a3, a2, t8              /* subtract the remainder from a2 */
                                    /* Here a3 counts bytes in 16w chunks */
    addu    a3, a0, a3              /* Now a3 is the final dst after 64-byte chunks */

    addu    t0, a0, a2              /* t0 is the "past the end" address */

    /*
     * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be
     * past the "t0-32" address.
     * This means: for x=128 the last "safe" a0 address is "t0-160".
     * Alternatively, for x=64 the last "safe" a0 address is "t0-96".
     * In the current version we use "pref 30, 128(a0)", so "t0-160" is the
     * limit.
     */
    subu    t9, t0, 160             /* t9 is the "last safe pref 30, 128(a0)" address */

    pref    0, 0(a1)                /* bring the first line of src, addr 0 */
    pref    0, 32(a1)               /* bring the second line of src, addr 32 */
    pref    0, 64(a1)               /* bring the third line of src, addr 64 */
    pref    30, 32(a0)              /* safe, as we have at least 64 bytes ahead */
    /* In case the a0 > t9 don't use "pref 30" at all */
    sgtu    v1, a0, t9
    bgtz    v1, $loop16w            /* skip "pref 30, 64(a0)" for too short arrays */
     nop
    /* otherwise, start with using pref30 */
    pref    30, 64(a0)

    /* Loop invariant: v1 != 0 means a0 is past t9, so "pref 30" is skipped */
$loop16w:
    pref    0, 96(a1)
    lw      t0, 0(a1)
    bgtz    v1, $skip_pref30_96     /* skip "pref 30, 96(a0)" */
     lw     t1, 4(a1)
    pref    30, 96(a0)              /* continue setting up the dest, addr 96 */
$skip_pref30_96:
    lw      t2, 8(a1)
    lw      t3, 12(a1)
    lw      t4, 16(a1)
    lw      t5, 20(a1)
    lw      t6, 24(a1)
    lw      t7, 28(a1)
    pref    0, 128(a1)              /* bring the next lines of src, addr 128 */

    sw      t0, 0(a0)
    sw      t1, 4(a0)
    sw      t2, 8(a0)
    sw      t3, 12(a0)
    sw      t4, 16(a0)
    sw      t5, 20(a0)
    sw      t6, 24(a0)
    sw      t7, 28(a0)

    lw      t0, 32(a1)
    bgtz    v1, $skip_pref30_128    /* skip "pref 30, 128(a0)" */
     lw     t1, 36(a1)
    pref    30, 128(a0)             /* continue setting up the dest, addr 128 */
$skip_pref30_128:
    lw      t2, 40(a1)
    lw      t3, 44(a1)
    lw      t4, 48(a1)
    lw      t5, 52(a1)
    lw      t6, 56(a1)
    lw      t7, 60(a1)
    pref    0, 160(a1)              /* bring the next lines of src, addr 160 */

    sw      t0, 32(a0)
    sw      t1, 36(a0)
    sw      t2, 40(a0)
    sw      t3, 44(a0)
    sw      t4, 48(a0)
    sw      t5, 52(a0)
    sw      t6, 56(a0)
    sw      t7, 60(a0)

    addiu   a0, a0, 64              /* adding 64 to dest */
    sgtu    v1, a0, t9              /* refresh the "pref 30 unsafe" flag */
    bne     a0, a3, $loop16w
     addiu  a1, a1, 64              /* adding 64 to src */
    move    a2, t8                  /* a2 = bytes left after the 64-byte chunks */

    /* Here we have src and dest word-aligned but less than 64-bytes to go */

$chk8w:
    pref    0, 0x0(a1)
    andi    t8, a2, 0x1f            /* is there a 32-byte chunk? */
                                    /* t8 is the remainder count past 32 bytes */
    beq     a2, t8, $chk1w          /* when a2=t8, no 32-byte chunk */
     nop

    lw      t0, 0(a1)
    lw      t1, 4(a1)
    lw      t2, 8(a1)
    lw      t3, 12(a1)
    lw      t4, 16(a1)
    lw      t5, 20(a1)
    lw      t6, 24(a1)
    lw      t7, 28(a1)
    addiu   a1, a1, 32

    sw      t0, 0(a0)
    sw      t1, 4(a0)
    sw      t2, 8(a0)
    sw      t3, 12(a0)
    sw      t4, 16(a0)
    sw      t5, 20(a0)
    sw      t6, 24(a0)
    sw      t7, 28(a0)
    addiu   a0, a0, 32

$chk1w:
    andi    a2, t8, 0x3             /* now a2 is the remainder past 1w chunks */
    beq     a2, t8, $last8
     subu   a3, t8, a2              /* a3 is count of bytes in 1w chunks */
    addu    a3, a0, a3              /* now a3 is the dst address past the 1w chunks */

    /* copying in words (4-byte chunks) */
$wordCopy_loop:
    lw      t3, 0(a1)               /* the first t3 may be equal t0 ... optimize? */
    addiu   a1, a1, 4
    addiu   a0, a0, 4
    bne     a0, a3, $wordCopy_loop
     sw     t3, -4(a0)              /* a0 was already advanced, hence -4 */

    /* For the last (<8) bytes */
$last8:
    blez    a2, leave
     addu   a3, a0, a2              /* a3 is the dst address one past the last byte */
$last8loop:
    lb      v1, 0(a1)
    addiu   a1, a1, 1
    addiu   a0, a0, 1
    bne     a0, a3, $last8loop
     sb     v1, -1(a0)              /* a0 was already advanced, hence -1 */

leave:
    j       ra
     nop

    /*
     * UNALIGNED case
     *
     * src and dst have different alignment within a word. The dst is made
     * word-aligned so plain "sw" can be used for stores; every source word
     * is assembled with an LWHI/LWLO pair.
     */

$unaligned:
    /* got here with a3="negu a0" */
    andi    a3, a3, 0x3             /* test if the a0 is word aligned */
    beqz    a3, $ua_chk16w
     subu   a2, a2, a3              /* bytes left after initial a3 bytes */

    LWHI    v1, 0(a1)
    LWLO    v1, 3(a1)
    addu    a1, a1, a3              /* a3 may be here 1, 2 or 3 */
    SWHI    v1, 0(a0)               /* stores only the a3 bytes up to the word boundary */
    addu    a0, a0, a3              /* below the dst will be word aligned (NOTE1) */

$ua_chk16w:
    andi    t8, a2, 0x3f            /* any whole 64-byte chunks? */
                                    /* t8 is the byte count after 64-byte chunks */
    beq     a2, t8, $ua_chk8w       /* if a2==t8, no 64-byte chunks */
                                    /* There will be at most 1 32-byte chunk after it */
     subu   a3, a2, t8              /* subtract the remainder from a2 */
                                    /* Here a3 counts bytes in 16w chunks */
    addu    a3, a0, a3              /* Now a3 is the final dst after 64-byte chunks */

    addu    t0, a0, a2              /* t0 is the "past the end" address */

    subu    t9, t0, 160             /* t9 is the "last safe pref 30, 128(a0)" address */

    pref    0, 0(a1)                /* bring the first line of src, addr 0 */
    pref    0, 32(a1)               /* bring the second line of src, addr 32 */
    pref    0, 64(a1)               /* bring the third line of src, addr 64 */
    pref    30, 32(a0)              /* safe, as we have at least 64 bytes ahead */
    /* In case the a0 > t9 don't use "pref 30" at all */
    sgtu    v1, a0, t9
    bgtz    v1, $ua_loop16w         /* skip "pref 30, 64(a0)" for too short arrays */
     nop
    /* otherwise, start with using pref30 */
    pref    30, 64(a0)

    /* Loop invariant: v1 != 0 means a0 is past t9, so "pref 30" is skipped */
$ua_loop16w:
    pref    0, 96(a1)
    LWHI    t0, 0(a1)
    LWLO    t0, 3(a1)
    LWHI    t1, 4(a1)
    bgtz    v1, $ua_skip_pref30_96  /* skip "pref 30, 96(a0)" */
     LWLO   t1, 7(a1)
    pref    30, 96(a0)              /* continue setting up the dest, addr 96 */
$ua_skip_pref30_96:
    LWHI    t2, 8(a1)
    LWLO    t2, 11(a1)
    LWHI    t3, 12(a1)
    LWLO    t3, 15(a1)
    LWHI    t4, 16(a1)
    LWLO    t4, 19(a1)
    LWHI    t5, 20(a1)
    LWLO    t5, 23(a1)
    LWHI    t6, 24(a1)
    LWLO    t6, 27(a1)
    LWHI    t7, 28(a1)
    LWLO    t7, 31(a1)
    pref    0, 128(a1)              /* bring the next lines of src, addr 128 */

    sw      t0, 0(a0)
    sw      t1, 4(a0)
    sw      t2, 8(a0)
    sw      t3, 12(a0)
    sw      t4, 16(a0)
    sw      t5, 20(a0)
    sw      t6, 24(a0)
    sw      t7, 28(a0)

    LWHI    t0, 32(a1)
    LWLO    t0, 35(a1)
    LWHI    t1, 36(a1)
    bgtz    v1, $ua_skip_pref30_128 /* skip "pref 30, 128(a0)" */
     LWLO   t1, 39(a1)
    pref    30, 128(a0)             /* continue setting up the dest, addr 128 */
$ua_skip_pref30_128:
    LWHI    t2, 40(a1)
    LWLO    t2, 43(a1)
    LWHI    t3, 44(a1)
    LWLO    t3, 47(a1)
    LWHI    t4, 48(a1)
    LWLO    t4, 51(a1)
    LWHI    t5, 52(a1)
    LWLO    t5, 55(a1)
    LWHI    t6, 56(a1)
    LWLO    t6, 59(a1)
    LWHI    t7, 60(a1)
    LWLO    t7, 63(a1)
    pref    0, 160(a1)              /* bring the next lines of src, addr 160 */

    sw      t0, 32(a0)
    sw      t1, 36(a0)
    sw      t2, 40(a0)
    sw      t3, 44(a0)
    sw      t4, 48(a0)
    sw      t5, 52(a0)
    sw      t6, 56(a0)
    sw      t7, 60(a0)

    addiu   a0, a0, 64              /* adding 64 to dest */
    sgtu    v1, a0, t9              /* refresh the "pref 30 unsafe" flag */
    bne     a0, a3, $ua_loop16w
     addiu  a1, a1, 64              /* adding 64 to src */
    move    a2, t8                  /* a2 = bytes left after the 64-byte chunks */

    /* Here the dest is word-aligned (src is not) with less than 64 bytes to go */

$ua_chk8w:
    pref    0, 0x0(a1)
    andi    t8, a2, 0x1f            /* is there a 32-byte chunk? */
                                    /* t8 is the remainder count */
    beq     a2, t8, $ua_chk1w       /* when a2=t8, no 32-byte chunk */
     nop                            /* keep the load below out of the delay slot:
                                     * on the taken path a1 may already point at
                                     * or past the end of the source buffer */

    LWHI    t0, 0(a1)
    LWLO    t0, 3(a1)
    LWHI    t1, 4(a1)
    LWLO    t1, 7(a1)
    LWHI    t2, 8(a1)
    LWLO    t2, 11(a1)
    LWHI    t3, 12(a1)
    LWLO    t3, 15(a1)
    LWHI    t4, 16(a1)
    LWLO    t4, 19(a1)
    LWHI    t5, 20(a1)
    LWLO    t5, 23(a1)
    LWHI    t6, 24(a1)
    LWLO    t6, 27(a1)
    LWHI    t7, 28(a1)
    LWLO    t7, 31(a1)
    addiu   a1, a1, 32

    sw      t0, 0(a0)
    sw      t1, 4(a0)
    sw      t2, 8(a0)
    sw      t3, 12(a0)
    sw      t4, 16(a0)
    sw      t5, 20(a0)
    sw      t6, 24(a0)
    sw      t7, 28(a0)
    addiu   a0, a0, 32

$ua_chk1w:
    andi    a2, t8, 0x3             /* now a2 is the remainder past 1w chunks */
    beq     a2, t8, $ua_smallCopy
     subu   a3, t8, a2              /* a3 is count of bytes in 1w chunks */
    addu    a3, a0, a3              /* now a3 is the dst address past the 1w chunks */

    /* copying in words (4-byte chunks) */
$ua_wordCopy_loop:
    LWHI    v1, 0(a1)
    LWLO    v1, 3(a1)
    addiu   a1, a1, 4
    addiu   a0, a0, 4               /* note: dst=a0 is word aligned here, see NOTE1 */
    bne     a0, a3, $ua_wordCopy_loop
     sw     v1, -4(a0)              /* a0 was already advanced, hence -4 */

    /* Now less than 4 bytes (value in a2) left to copy */
$ua_smallCopy:
    beqz    a2, leave
     addu   a3, a0, a2              /* a3 is the dst address one past the last byte */
$ua_smallCopy_loop:
    lb      v1, 0(a1)
    addiu   a1, a1, 1
    addiu   a0, a0, 1
    bne     a0, a3, $ua_smallCopy_loop
     sb     v1, -1(a0)              /* a0 was already advanced, hence -1 */

    j       ra
     nop

END(pixman_mips_fast_memcpy)