/*
 * Copyright (c) 2012
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "pixman-mips-dspr2-asm.h"

/*
 * This routine could be optimized for MIPS64. The current code only
 * uses MIPS32 instructions.
 */

/*
 * LWHI/SWHI access the bytes from the given address up to the end of the
 * enclosing aligned word; LWLO/SWLO access the bytes from the start of the
 * enclosing aligned word up to the given address.  Which of lwl/lwr (swl/swr)
 * does which depends on the byte order.
 */
#ifdef EB
# define LWHI lwl               /* high part is left in big-endian */
# define SWHI swl               /* high part is left in big-endian */
# define LWLO lwr               /* low part is right in big-endian */
# define SWLO swr               /* low part is right in big-endian */
#else
# define LWHI lwr               /* high part is right in little-endian */
# define SWHI swr               /* high part is right in little-endian */
# define LWLO lwl               /* low part is left in little-endian */
# define SWLO swl               /* low part is left in little-endian */
#endif

/*
 * pixman_mips_fast_memcpy
 *
 * Copies a2 bytes from the address in a1 to the address in a0 and returns
 * the destination pointer as it was passed in.
 *
 * In:      a0 = dst, a1 = src, a2 = byte count
 * Out:     v0 = dst (value of a0 on entry)
 * Writes:  AT, v0, v1, a0-a3, t0-t9; no stack is used
 *
 * Paths:
 *   count < 8                  byte loop only ($last8)
 *   (a0 ^ a1) & 3 == 0         head bytes up to a word boundary, then
 *                              64-byte blocks, one 32-byte block, single
 *                              words and tail bytes, all with lw/sw
 *   otherwise ($unaligned)     same structure, but dst is word-aligned
 *                              while src is not: every source word is read
 *                              with an LWHI/LWLO pair and stored with sw
 *
 * Notes:
 *   - The count is tested with signed instructions (slti here, blez at
 *     $last8), so a count with the top bit set returns without copying.
 *   - Branch delay slots are filled by hand: the instruction after every
 *     branch/jump (indented by one extra column below) executes whether or
 *     not the branch is taken.  Do not reorder instructions around branches.
 *     This needs ".set noreorder", and the explicit use of AT needs
 *     ".set noat"; presumably both come from LEAF_MIPS32R2 in the included
 *     header -- confirm there before changing either.
 */
LEAF_MIPS32R2(pixman_mips_fast_memcpy)

    slti    AT, a2, 8
    bne     AT, zero, $last8
     move   v0, a0              /* memcpy returns the dst pointer */

/* Test if the src and dst are word-aligned, or can be made word-aligned */
    xor     t8, a1, a0
    andi    t8, t8, 0x3         /* t8 is a0/a1 word-displacement */

    bne     t8, zero, $unaligned
     negu   a3, a0

    andi    a3, a3, 0x3         /* we need to copy a3 bytes to make a0/a1 aligned */
    beq     a3, zero, $chk16w   /* when a3=0 then the dst (a0) is word-aligned */
     subu   a2, a2, a3          /* now a2 is the remaining bytes count */

    LWHI    t8, 0(a1)
    addu    a1, a1, a3
    SWHI    t8, 0(a0)
    addu    a0, a0, a3

/* Now the dst/src are mutually word-aligned with word-aligned addresses */
$chk16w:
    andi    t8, a2, 0x3f        /* any whole 64-byte chunks? */
                                /* t8 is the byte count after 64-byte chunks */

    beq     a2, t8, $chk8w      /* if a2==t8, no 64-byte chunks */
                                /* There will be at most 1 32-byte chunk after it */
     subu   a3, a2, t8          /* subtract from a2 the remainder */
                                /* Here a3 counts bytes in 16w chunks */
    addu    a3, a0, a3          /* Now a3 is the final dst after 64-byte chunks */

    addu    t0, a0, a2          /* t0 is the "past the end" address */

/*
 * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
 * the "t0-32" address
 * This means: for x=128 the last "safe" a0 address is "t0-160"
 * Alternatively, for x=64 the last "safe" a0 address is "t0-96"
 * In the current version we use "pref 30, 128(a0)", so "t0-160" is the limit
 */
    subu    t9, t0, 160         /* t9 is the "last safe pref 30, 128(a0)" address */

    pref    0, 0(a1)            /* bring the first line of src, addr 0 */
    pref    0, 32(a1)           /* bring the second line of src, addr 32 */
    pref    0, 64(a1)           /* bring the third line of src, addr 64 */
    pref    30, 32(a0)          /* safe, as we have at least 64 bytes ahead */
/* In case the a0 > t9 don't use "pref 30" at all */
    sgtu    v1, a0, t9
    bgtz    v1, $loop16w        /* skip "pref 30, 64(a0)" for too short arrays */
     nop
/* otherwise, start with using pref30 */
    pref    30, 64(a0)
$loop16w:
    pref    0, 96(a1)
    lw      t0, 0(a1)           /* t0 (the end address) is dead now: t9 already holds the limit */
    bgtz    v1, $skip_pref30_96 /* skip "pref 30, 96(a0)" */
     lw     t1, 4(a1)
    pref    30, 96(a0)          /* continue setting up the dest, addr 96 */
$skip_pref30_96:
    lw      t2, 8(a1)
    lw      t3, 12(a1)
    lw      t4, 16(a1)
    lw      t5, 20(a1)
    lw      t6, 24(a1)
    lw      t7, 28(a1)
    pref    0, 128(a1)          /* bring the next lines of src, addr 128 */

    sw      t0, 0(a0)
    sw      t1, 4(a0)
    sw      t2, 8(a0)
    sw      t3, 12(a0)
    sw      t4, 16(a0)
    sw      t5, 20(a0)
    sw      t6, 24(a0)
    sw      t7, 28(a0)

    lw      t0, 32(a1)
    bgtz    v1, $skip_pref30_128 /* skip "pref 30, 128(a0)" */
     lw     t1, 36(a1)
    pref    30, 128(a0)         /* continue setting up the dest, addr 128 */
$skip_pref30_128:
    lw      t2, 40(a1)
    lw      t3, 44(a1)
    lw      t4, 48(a1)
    lw      t5, 52(a1)
    lw      t6, 56(a1)
    lw      t7, 60(a1)
    pref    0, 160(a1)          /* bring the next lines of src, addr 160 */

    sw      t0, 32(a0)
    sw      t1, 36(a0)
    sw      t2, 40(a0)
    sw      t3, 44(a0)
    sw      t4, 48(a0)
    sw      t5, 52(a0)
    sw      t6, 56(a0)
    sw      t7, 60(a0)

    addiu   a0, a0, 64          /* adding 64 to dest */
    sgtu    v1, a0, t9          /* re-evaluate whether "pref 30" is still safe */
    bne     a0, a3, $loop16w
     addiu  a1, a1, 64          /* adding 64 to src */
    move    a2, t8              /* a2 = bytes left after the 64-byte chunks */

/* Here we have src and dest word-aligned but less than 64-bytes to go */

$chk8w:
    pref    0, 0x0(a1)
    andi    t8, a2, 0x1f        /* is there a 32-byte chunk? */
                                /* the t8 is the remainder count past 32-bytes */
    beq     a2, t8, $chk1w      /* when a2=t8, no 32-byte chunk */
     nop

    lw      t0, 0(a1)
    lw      t1, 4(a1)
    lw      t2, 8(a1)
    lw      t3, 12(a1)
    lw      t4, 16(a1)
    lw      t5, 20(a1)
    lw      t6, 24(a1)
    lw      t7, 28(a1)
    addiu   a1, a1, 32

    sw      t0, 0(a0)
    sw      t1, 4(a0)
    sw      t2, 8(a0)
    sw      t3, 12(a0)
    sw      t4, 16(a0)
    sw      t5, 20(a0)
    sw      t6, 24(a0)
    sw      t7, 28(a0)
    addiu   a0, a0, 32

$chk1w:
    andi    a2, t8, 0x3         /* now a2 is the remainder past 1w chunks */
    beq     a2, t8, $last8
     subu   a3, t8, a2          /* a3 is count of bytes in 1w chunks */
    addu    a3, a0, a3          /* now a3 is the dst address past the 1w chunks */

/* copying in words (4-byte chunks) */
$wordCopy_loop:
    lw      t3, 0(a1)           /* the first t3 may be equal t0 ... optimize? */
    addiu   a1, a1, 4
    addiu   a0, a0, 4
    bne     a0, a3, $wordCopy_loop
     sw     t3, -4(a0)

/* For the last (<8) bytes */
$last8:
    blez    a2, $leave
     addu   a3, a0, a2          /* a3 is the dst address just past the last byte */
$last8loop:
    lb      v1, 0(a1)
    addiu   a1, a1, 1
    addiu   a0, a0, 1
    bne     a0, a3, $last8loop
     sb     v1, -1(a0)

$leave:
    j       ra
     nop

/*
 * UNALIGNED case
 */

$unaligned:
    /* got here with a3="negu a0" */
    andi    a3, a3, 0x3         /* test if the a0 is word aligned */
    beqz    a3, $ua_chk16w
     subu   a2, a2, a3          /* bytes left after initial a3 bytes */

    LWHI    v1, 0(a1)
    LWLO    v1, 3(a1)
    addu    a1, a1, a3          /* a3 may be here 1, 2 or 3 */
    SWHI    v1, 0(a0)
    addu    a0, a0, a3          /* below the dst will be word aligned (NOTE1) */

$ua_chk16w:
    andi    t8, a2, 0x3f        /* any whole 64-byte chunks? */
                                /* t8 is the byte count after 64-byte chunks */
    beq     a2, t8, $ua_chk8w   /* if a2==t8, no 64-byte chunks */
                                /* There will be at most 1 32-byte chunk after it */
     subu   a3, a2, t8          /* subtract from a2 the remainder */
                                /* Here a3 counts bytes in 16w chunks */
    addu    a3, a0, a3          /* Now a3 is the final dst after 64-byte chunks */

    addu    t0, a0, a2          /* t0 is the "past the end" address */

    subu    t9, t0, 160         /* t9 is the "last safe pref 30, 128(a0)" address */

    pref    0, 0(a1)            /* bring the first line of src, addr 0 */
    pref    0, 32(a1)           /* bring the second line of src, addr 32 */
    pref    0, 64(a1)           /* bring the third line of src, addr 64 */
    pref    30, 32(a0)          /* safe, as we have at least 64 bytes ahead */
/* In case the a0 > t9 don't use "pref 30" at all */
    sgtu    v1, a0, t9
    bgtz    v1, $ua_loop16w     /* skip "pref 30, 64(a0)" for too short arrays */
     nop
/* otherwise, start with using pref30 */
    pref    30, 64(a0)
$ua_loop16w:
    pref    0, 96(a1)
    LWHI    t0, 0(a1)
    LWLO    t0, 3(a1)
    LWHI    t1, 4(a1)
    bgtz    v1, $ua_skip_pref30_96
     LWLO   t1, 7(a1)
    pref    30, 96(a0)          /* continue setting up the dest, addr 96 */
$ua_skip_pref30_96:
    LWHI    t2, 8(a1)
    LWLO    t2, 11(a1)
    LWHI    t3, 12(a1)
    LWLO    t3, 15(a1)
    LWHI    t4, 16(a1)
    LWLO    t4, 19(a1)
    LWHI    t5, 20(a1)
    LWLO    t5, 23(a1)
    LWHI    t6, 24(a1)
    LWLO    t6, 27(a1)
    LWHI    t7, 28(a1)
    LWLO    t7, 31(a1)
    pref    0, 128(a1)          /* bring the next lines of src, addr 128 */

    sw      t0, 0(a0)
    sw      t1, 4(a0)
    sw      t2, 8(a0)
    sw      t3, 12(a0)
    sw      t4, 16(a0)
    sw      t5, 20(a0)
    sw      t6, 24(a0)
    sw      t7, 28(a0)

    LWHI    t0, 32(a1)
    LWLO    t0, 35(a1)
    LWHI    t1, 36(a1)
    bgtz    v1, $ua_skip_pref30_128
     LWLO   t1, 39(a1)
    pref    30, 128(a0)         /* continue setting up the dest, addr 128 */
$ua_skip_pref30_128:
    LWHI    t2, 40(a1)
    LWLO    t2, 43(a1)
    LWHI    t3, 44(a1)
    LWLO    t3, 47(a1)
    LWHI    t4, 48(a1)
    LWLO    t4, 51(a1)
    LWHI    t5, 52(a1)
    LWLO    t5, 55(a1)
    LWHI    t6, 56(a1)
    LWLO    t6, 59(a1)
    LWHI    t7, 60(a1)
    LWLO    t7, 63(a1)
    pref    0, 160(a1)          /* bring the next lines of src, addr 160 */

    sw      t0, 32(a0)
    sw      t1, 36(a0)
    sw      t2, 40(a0)
    sw      t3, 44(a0)
    sw      t4, 48(a0)
    sw      t5, 52(a0)
    sw      t6, 56(a0)
    sw      t7, 60(a0)

    addiu   a0, a0, 64          /* adding 64 to dest */
    sgtu    v1, a0, t9          /* re-evaluate whether "pref 30" is still safe */
    bne     a0, a3, $ua_loop16w
     addiu  a1, a1, 64          /* adding 64 to src */
    move    a2, t8              /* a2 = bytes left after the 64-byte chunks */

/* Here dst is word-aligned (src is not) and less than 64 bytes are left */

$ua_chk8w:
    pref    0, 0x0(a1)
    andi    t8, a2, 0x1f        /* is there a 32-byte chunk? */
                                /* the t8 is the remainder count */
    beq     a2, t8, $ua_chk1w   /* when a2=t8, no 32-byte chunk */
     LWHI   t0, 0(a1)           /* delay slot: first load of the 32-byte chunk; */
                                /* if the branch is taken t0 is never read */
    LWLO    t0, 3(a1)
    LWHI    t1, 4(a1)
    LWLO    t1, 7(a1)
    LWHI    t2, 8(a1)
    LWLO    t2, 11(a1)
    LWHI    t3, 12(a1)
    LWLO    t3, 15(a1)
    LWHI    t4, 16(a1)
    LWLO    t4, 19(a1)
    LWHI    t5, 20(a1)
    LWLO    t5, 23(a1)
    LWHI    t6, 24(a1)
    LWLO    t6, 27(a1)
    LWHI    t7, 28(a1)
    LWLO    t7, 31(a1)
    addiu   a1, a1, 32

    sw      t0, 0(a0)
    sw      t1, 4(a0)
    sw      t2, 8(a0)
    sw      t3, 12(a0)
    sw      t4, 16(a0)
    sw      t5, 20(a0)
    sw      t6, 24(a0)
    sw      t7, 28(a0)
    addiu   a0, a0, 32

$ua_chk1w:
    andi    a2, t8, 0x3         /* now a2 is the remainder past 1w chunks */
    beq     a2, t8, $ua_smallCopy
     subu   a3, t8, a2          /* a3 is count of bytes in 1w chunks */
    addu    a3, a0, a3          /* now a3 is the dst address past the 1w chunks */

/* copying in words (4-byte chunks) */
$ua_wordCopy_loop:
    LWHI    v1, 0(a1)
    LWLO    v1, 3(a1)
    addiu   a1, a1, 4
    addiu   a0, a0, 4           /* note: dst=a0 is word aligned here, see NOTE1 */
    bne     a0, a3, $ua_wordCopy_loop
     sw     v1, -4(a0)

/* Now less than 4 bytes (value in a2) left to copy */
$ua_smallCopy:
    beqz    a2, $leave
     addu   a3, a0, a2          /* a3 is the dst address just past the last byte */
$ua_smallCopy_loop:
    lb      v1, 0(a1)
    addiu   a1, a1, 1
    addiu   a0, a0, 1
    bne     a0, a3, $ua_smallCopy_loop
     sb     v1, -1(a0)

    j       ra
     nop

END(pixman_mips_fast_memcpy)