/*
 * Copyright (c) 2012
 *      MIPS Technologies, Inc., California.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
 *    contributors may be used to endorse or promote products derived from
 *    this software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include "pixman-mips-dspr2-asm.h"

/*
 * void *pixman_mips_fast_memcpy (void *dst, const void *src, size_t len)
 *
 * In:    a0 = dst, a1 = src, a2 = byte count
 * Out:   v0 = dst (memcpy return convention)
 * Uses:  a3, v1, t0-t9, AT (all caller-volatile under o32)
 *
 * Branch delay slots are filled by hand; lines marked "(delay slot)"
 * execute regardless of whether the preceding branch is taken.
 *
 * This routine could be optimized for MIPS64.  The current code only
 * uses MIPS32 instructions.
 */

#ifdef EB
#  define LWHI	lwl		/* high part is left in big-endian	*/
#  define SWHI	swl		/* high part is left in big-endian	*/
#  define LWLO	lwr		/* low part is right in big-endian	*/
#  define SWLO	swr		/* low part is right in big-endian	*/
#else
#  define LWHI	lwr		/* high part is right in little-endian	*/
#  define SWHI	swr		/* high part is right in little-endian	*/
#  define LWLO	lwl		/* low part is left in little-endian	*/
#  define SWLO	swl		/* low part is left in little-endian	*/
#endif

LEAF_MIPS32R2(pixman_mips_fast_memcpy)

	slti	AT, a2, 8
	bne	AT, zero, $last8	/* fewer than 8 bytes: plain byte copy */
	move	v0, a0			/* (delay slot) memcpy returns the dst pointer */

/* Test if the src and dst are word-aligned, or can be made word-aligned */
	xor	t8, a1, a0
	andi	t8, t8, 0x3		/* t8 is a0/a1 word-displacement */

	bne	t8, zero, $unaligned
	negu	a3, a0			/* (delay slot) */

	andi	a3, a3, 0x3	/* we need to copy a3 bytes to make a0/a1 aligned */
	beq	a3, zero, $chk16w /* when a3==0 the dst (a0) is word-aligned */
	subu	a2, a2, a3	/* (delay slot) now a2 is the remaining byte count */

	LWHI	t8, 0(a1)
	addu	a1, a1, a3
	SWHI	t8, 0(a0)
	addu	a0, a0, a3

/* Now the dst/src are mutually word-aligned with word-aligned addresses */
$chk16w:
	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
				/* t8 is the byte count after 64-byte chunks */

	beq	a2, t8, $chk8w	/* if a2==t8, no 64-byte chunks */
				/* There will be at most 1 32-byte chunk after it */
	subu	a3, a2, t8	/* (delay slot) subtract from a2 the remainder */
				/* Here a3 counts bytes in 16w chunks */
	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */

	addu	t0, a0, a2	/* t0 is the "past the end" address */

/*
 * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
 * the "t0-32" address
 * This means: for x=128 the last "safe" a0 address is "t0-160"
 * Alternatively, for x=64 the last "safe" a0 address is "t0-96"
 * In the current version we use "pref 30, 128(a0)", so "t0-160" is the limit
 */
	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */

	pref	0, 0(a1)	/* bring the first line of src, addr 0 */
	pref	0, 32(a1)	/* bring the second line of src, addr 32 */
	pref	0, 64(a1)	/* bring the third line of src, addr 64 */
	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
/* In case the a0 > t9 don't use "pref 30" at all */
	sgtu	v1, a0, t9
	bgtz	v1, $loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
	nop			/* (delay slot) */
/* otherwise, start with using pref30 */
	pref	30, 64(a0)
$loop16w:
	pref	0, 96(a1)
	lw	t0, 0(a1)
	bgtz	v1, $skip_pref30_96	/* skip "pref 30, 96(a0)" */
	lw	t1, 4(a1)		/* (delay slot) */
	pref	30, 96(a0)	/* continue setting up the dest, addr 96 */
$skip_pref30_96:
	lw	t2, 8(a1)
	lw	t3, 12(a1)
	lw	t4, 16(a1)
	lw	t5, 20(a1)
	lw	t6, 24(a1)
	lw	t7, 28(a1)
	pref	0, 128(a1)	/* bring the next lines of src, addr 128 */

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)

	lw	t0, 32(a1)
	bgtz	v1, $skip_pref30_128	/* skip "pref 30, 128(a0)" */
	lw	t1, 36(a1)		/* (delay slot) */
	pref	30, 128(a0)	/* continue setting up the dest, addr 128 */
$skip_pref30_128:
	lw	t2, 40(a1)
	lw	t3, 44(a1)
	lw	t4, 48(a1)
	lw	t5, 52(a1)
	lw	t6, 56(a1)
	lw	t7, 60(a1)
	pref	0, 160(a1)	/* bring the next lines of src, addr 160 */

	sw	t0, 32(a0)
	sw	t1, 36(a0)
	sw	t2, 40(a0)
	sw	t3, 44(a0)
	sw	t4, 48(a0)
	sw	t5, 52(a0)
	sw	t6, 56(a0)
	sw	t7, 60(a0)

	addiu	a0, a0, 64	/* adding 64 to dest */
	sgtu	v1, a0, t9
	bne	a0, a3, $loop16w
	addiu	a1, a1, 64	/* (delay slot) adding 64 to src */
	move	a2, t8

/* Here we have src and dest word-aligned but less than 64-bytes to go */

$chk8w:
	pref	0, 0x0(a1)
	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
				/* t8 is the remainder count past 32-bytes */
	beq	a2, t8, $chk1w	/* when a2==t8, no 32-byte chunk */
	nop			/* (delay slot) */

	lw	t0, 0(a1)
	lw	t1, 4(a1)
	lw	t2, 8(a1)
	lw	t3, 12(a1)
	lw	t4, 16(a1)
	lw	t5, 20(a1)
	lw	t6, 24(a1)
	lw	t7, 28(a1)
	addiu	a1, a1, 32

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)
	addiu	a0, a0, 32

$chk1w:
	andi	a2, t8, 0x3	/* now a2 is the remainder past 1w chunks */
	beq	a2, t8, $last8
	subu	a3, t8, a2	/* (delay slot) a3 is count of bytes in 1w chunks */
	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */

/* copying in words (4-byte chunks) */
$wordCopy_loop:
	lw	t3, 0(a1)	/* the first t3 may be equal t0 ... optimize? */
	addiu	a1, a1, 4
	addiu	a0, a0, 4
	bne	a0, a3, $wordCopy_loop
	sw	t3, -4(a0)	/* (delay slot) */

/* For the last (<8) bytes */
$last8:
	blez	a2, leave
	addu	a3, a0, a2	/* (delay slot) a3 is the last dst address */
$last8loop:
	lb	v1, 0(a1)
	addiu	a1, a1, 1
	addiu	a0, a0, 1
	bne	a0, a3, $last8loop
	sb	v1, -1(a0)	/* (delay slot) */

leave:	j	ra
	nop

/*
 * UNALIGNED case: src and dst have different word alignment, so every
 * word of src is assembled with LWHI/LWLO before a word store to dst.
 */

$unaligned:
	/* got here with a3 = "negu a0" */
	andi	a3, a3, 0x3	/* test if the a0 is word aligned */
	beqz	a3, $ua_chk16w
	subu	a2, a2, a3	/* (delay slot) bytes left after initial a3 bytes */

	LWHI	v1, 0(a1)
	LWLO	v1, 3(a1)
	addu	a1, a1, a3	/* a3 may be here 1, 2 or 3 */
	SWHI	v1, 0(a0)
	addu	a0, a0, a3	/* below the dst will be word aligned (NOTE1) */

$ua_chk16w:
	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
				/* t8 is the byte count after 64-byte chunks */
	beq	a2, t8, $ua_chk8w /* if a2==t8, no 64-byte chunks */
				/* There will be at most 1 32-byte chunk after it */
	subu	a3, a2, t8	/* (delay slot) subtract from a2 the remainder */
				/* Here a3 counts bytes in 16w chunks */
	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */

	addu	t0, a0, a2	/* t0 is the "past the end" address */

	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */

	pref	0, 0(a1)	/* bring the first line of src, addr 0 */
	pref	0, 32(a1)	/* bring the second line of src, addr 32 */
	pref	0, 64(a1)	/* bring the third line of src, addr 64 */
	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
/* In case the a0 > t9 don't use "pref 30" at all */
	sgtu	v1, a0, t9
	bgtz	v1, $ua_loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
	nop			/* (delay slot) */
/* otherwise, start with using pref30 */
	pref	30, 64(a0)
$ua_loop16w:
	pref	0, 96(a1)
	LWHI	t0, 0(a1)
	LWLO	t0, 3(a1)
	LWHI	t1, 4(a1)
	bgtz	v1, $ua_skip_pref30_96
	LWLO	t1, 7(a1)	/* (delay slot) */
	pref	30, 96(a0)	/* continue setting up the dest, addr 96 */
$ua_skip_pref30_96:
	LWHI	t2, 8(a1)
	LWLO	t2, 11(a1)
	LWHI	t3, 12(a1)
	LWLO	t3, 15(a1)
	LWHI	t4, 16(a1)
	LWLO	t4, 19(a1)
	LWHI	t5, 20(a1)
	LWLO	t5, 23(a1)
	LWHI	t6, 24(a1)
	LWLO	t6, 27(a1)
	LWHI	t7, 28(a1)
	LWLO	t7, 31(a1)
	pref	0, 128(a1)	/* bring the next lines of src, addr 128 */

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)

	LWHI	t0, 32(a1)
	LWLO	t0, 35(a1)
	LWHI	t1, 36(a1)
	bgtz	v1, $ua_skip_pref30_128
	LWLO	t1, 39(a1)	/* (delay slot) */
	pref	30, 128(a0)	/* continue setting up the dest, addr 128 */
$ua_skip_pref30_128:
	LWHI	t2, 40(a1)
	LWLO	t2, 43(a1)
	LWHI	t3, 44(a1)
	LWLO	t3, 47(a1)
	LWHI	t4, 48(a1)
	LWLO	t4, 51(a1)
	LWHI	t5, 52(a1)
	LWLO	t5, 55(a1)
	LWHI	t6, 56(a1)
	LWLO	t6, 59(a1)
	LWHI	t7, 60(a1)
	LWLO	t7, 63(a1)
	pref	0, 160(a1)	/* bring the next lines of src, addr 160 */

	sw	t0, 32(a0)
	sw	t1, 36(a0)
	sw	t2, 40(a0)
	sw	t3, 44(a0)
	sw	t4, 48(a0)
	sw	t5, 52(a0)
	sw	t6, 56(a0)
	sw	t7, 60(a0)

	addiu	a0, a0, 64	/* adding 64 to dest */
	sgtu	v1, a0, t9
	bne	a0, a3, $ua_loop16w
	addiu	a1, a1, 64	/* (delay slot) adding 64 to src */
	move	a2, t8

/* Here we have src and dest word-aligned but less than 64-bytes to go */

$ua_chk8w:
	pref	0, 0x0(a1)
	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
				/* t8 is the remainder count */
	beq	a2, t8, $ua_chk1w /* when a2==t8, no 32-byte chunk */

	LWHI	t0, 0(a1)	/* (delay slot) t0 is scratch; dead if branch taken */
	LWLO	t0, 3(a1)
	LWHI	t1, 4(a1)
	LWLO	t1, 7(a1)
	LWHI	t2, 8(a1)
	LWLO	t2, 11(a1)
	LWHI	t3, 12(a1)
	LWLO	t3, 15(a1)
	LWHI	t4, 16(a1)
	LWLO	t4, 19(a1)
	LWHI	t5, 20(a1)
	LWLO	t5, 23(a1)
	LWHI	t6, 24(a1)
	LWLO	t6, 27(a1)
	LWHI	t7, 28(a1)
	LWLO	t7, 31(a1)
	addiu	a1, a1, 32

	sw	t0, 0(a0)
	sw	t1, 4(a0)
	sw	t2, 8(a0)
	sw	t3, 12(a0)
	sw	t4, 16(a0)
	sw	t5, 20(a0)
	sw	t6, 24(a0)
	sw	t7, 28(a0)
	addiu	a0, a0, 32

$ua_chk1w:
	andi	a2, t8, 0x3	/* now a2 is the remainder past 1w chunks */
	beq	a2, t8, $ua_smallCopy
	subu	a3, t8, a2	/* (delay slot) a3 is count of bytes in 1w chunks */
	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */

/* copying in words (4-byte chunks) */
$ua_wordCopy_loop:
	LWHI	v1, 0(a1)
	LWLO	v1, 3(a1)
	addiu	a1, a1, 4
	addiu	a0, a0, 4	/* note: dst=a0 is word aligned here, see NOTE1 */
	bne	a0, a3, $ua_wordCopy_loop
	sw	v1, -4(a0)	/* (delay slot) */

/* Now less than 4 bytes (value in a2) left to copy */
$ua_smallCopy:
	beqz	a2, leave
	addu	a3, a0, a2	/* (delay slot) a3 is the last dst address */
$ua_smallCopy_loop:
	lb	v1, 0(a1)
	addiu	a1, a1, 1
	addiu	a0, a0, 1
	bne	a0, a3, $ua_smallCopy_loop
	sb	v1, -1(a0)	/* (delay slot) */

	j	ra
	nop

END(pixman_mips_fast_memcpy)