gfx/cairo/libpixman/src/pixman-mips-memcpy-asm.S

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /*
     2  * Copyright (c) 2012
     3  *      MIPS Technologies, Inc., California.
     4  *
     5  * Redistribution and use in source and binary forms, with or without
     6  * modification, are permitted provided that the following conditions
     7  * are met:
     8  * 1. Redistributions of source code must retain the above copyright
     9  *    notice, this list of conditions and the following disclaimer.
    10  * 2. Redistributions in binary form must reproduce the above copyright
    11  *    notice, this list of conditions and the following disclaimer in the
    12  *    documentation and/or other materials provided with the distribution.
    13  * 3. Neither the name of the MIPS Technologies, Inc., nor the names of its
    14  *    contributors may be used to endorse or promote products derived from
    15  *    this software without specific prior written permission.
    16  *
    17  * THIS SOFTWARE IS PROVIDED BY THE MIPS TECHNOLOGIES, INC. ``AS IS'' AND
    18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
    19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
    20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE MIPS TECHNOLOGIES, INC. BE LIABLE
    21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
    22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
    23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
    24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
    25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
    26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
    27  * SUCH DAMAGE.
    28  */
    30 #include "pixman-mips-dspr2-asm.h"
    32 /*
    33  * This routine could be optimized for MIPS64. The current code only
    34  * uses MIPS32 instructions.
    35  */
    37 #ifdef EB
    38 #  define LWHI	lwl		/* high part is left in big-endian */
    39 #  define SWHI	swl		/* high part is left in big-endian */
    40 #  define LWLO	lwr		/* low part is right in big-endian */
    41 #  define SWLO	swr		/* low part is right in big-endian */
    42 #else
    43 #  define LWHI	lwr		/* high part is right in little-endian */
    44 #  define SWHI	swr		/* high part is right in little-endian */
    45 #  define LWLO	lwl		/* low part is left in little-endian */
    46 #  define SWLO	swl		/* low part is left in little-endian */
    47 #endif
    49 LEAF_MIPS32R2(pixman_mips_fast_memcpy)
       /*
        * memcpy-style byte copy (ascending addresses), MIPS32 instructions only.
        *
        * In:    a0 = dst, a1 = src, a2 = number of bytes to copy
        * Out:   v0 = dst as passed in a0
        * Uses:  AT, v1, a0-a3, t0-t9 as scratch; sp is not touched
        *
        * Branch delay slots are filled by hand: the instruction that follows a
        * branch or jump executes whether or not the branch is taken.  This
        * relies on the assembler not reordering instructions (presumably
        * LEAF_MIPS32R2 selects noreorder/noat -- confirm in the header).
        *
        * Strategy: counts below 8 go straight to the byte loop.  Otherwise, if
        * src and dst share the same offset within a word, one partial-word
        * transfer aligns both and the data moves with lw/sw in 64-byte,
        * 32-byte and 4-byte steps.  If they do not, dst alone is aligned and
        * the same steps are used, with each source word assembled by an
        * LWHI/LWLO pair.  Leftover bytes are copied one at a time.
        */
    51 	slti	AT, a2, 8
    52 	bne	AT, zero, $last8
    53 	move	v0, a0	/* (delay slot) memcpy returns the dst pointer */
    55 /* Test if the src and dst are word-aligned, or can be made word-aligned */
    56 	xor	t8, a1, a0
    57 	andi	t8, t8, 0x3		/* t8 is a0/a1 word-displacement */
    59 	bne	t8, zero, $unaligned
    60 	negu	a3, a0
    62 	andi	a3, a3, 0x3	/* we need to copy a3 bytes to make a0/a1 aligned */
    63 	beq	a3, zero, $chk16w	/* when a3=0 then the dst (a0) is word-aligned */
    64 	subu	a2, a2, a3	/* now a2 is the remaining bytes count */
    66 	LWHI	t8, 0(a1)
    67 	addu	a1, a1, a3
    68 	SWHI	t8, 0(a0)
    69 	addu	a0, a0, a3
    71 /* Now the dst/src are mutually word-aligned with word-aligned addresses */
    72 $chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
    73 				/* t8 is the byte count after 64-byte chunks */
    75 	beq	a2, t8, $chk8w	/* if a2==t8, no 64-byte chunks */
    76 				/* There will be at most 1 32-byte chunk after it */
    77 	subu	a3, a2, t8	/* subtract from a2 the remainder */
    78                                 /* Here a3 counts bytes in 16w chunks */
    79 	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */
    81 	addu	t0, a0, a2	/* t0 is the "past the end" address */
    83 /*
    84  * When in the loop we exercise "pref 30, x(a0)", the a0+x should not be past
    85  * the "t0-32" address
    86  * This means: for x=128 the last "safe" a0 address is "t0-160"
    87  * Alternatively, for x=64 the last "safe" a0 address is "t0-96"
    88  * In the current version we use "pref 30, 128(a0)", so "t0-160" is the limit
    89  */
    90 	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */
    92 	pref    0, 0(a1)		/* bring the first line of src, addr 0 */
    93 	pref    0, 32(a1)	/* bring the second line of src, addr 32 */
    94 	pref    0, 64(a1)	/* bring the third line of src, addr 64 */
    95 	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
    96 /* In case the a0 > t9 don't use "pref 30" at all */
    97 	sgtu	v1, a0, t9
    98 	bgtz	v1, $loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
    99 	nop
   100 /* otherwise, start with using pref30 */
   101 	pref	30, 64(a0)
   102 $loop16w:
   103 	pref	0, 96(a1)
   104 	lw	t0, 0(a1)
   105 	bgtz	v1, $skip_pref30_96	/* skip "pref 30, 96(a0)" */
   106 	lw	t1, 4(a1)
   107 	pref    30, 96(a0)   /* continue setting up the dest, addr 96 */
   108 $skip_pref30_96:
   109 	lw	t2, 8(a1)
   110 	lw	t3, 12(a1)
   111 	lw	t4, 16(a1)
   112 	lw	t5, 20(a1)
   113 	lw	t6, 24(a1)
   114 	lw	t7, 28(a1)
   115         pref    0, 128(a1)    /* bring the next lines of src, addr 128 */
   117 	sw	t0, 0(a0)
   118 	sw	t1, 4(a0)
   119 	sw	t2, 8(a0)
   120 	sw	t3, 12(a0)
   121 	sw	t4, 16(a0)
   122 	sw	t5, 20(a0)
   123 	sw	t6, 24(a0)
   124 	sw	t7, 28(a0)
   126 	lw	t0, 32(a1)
   127 	bgtz	v1, $skip_pref30_128	/* skip "pref 30, 128(a0)" */
   128 	lw	t1, 36(a1)
   129 	pref    30, 128(a0)   /* continue setting up the dest, addr 128 */
   130 $skip_pref30_128:
   131 	lw	t2, 40(a1)
   132 	lw	t3, 44(a1)
   133 	lw	t4, 48(a1)
   134 	lw	t5, 52(a1)
   135 	lw	t6, 56(a1)
   136 	lw	t7, 60(a1)
   137         pref    0, 160(a1)    /* bring the next lines of src, addr 160 */
   139 	sw	t0, 32(a0)
   140 	sw	t1, 36(a0)
   141 	sw	t2, 40(a0)
   142 	sw	t3, 44(a0)
   143 	sw	t4, 48(a0)
   144 	sw	t5, 52(a0)
   145 	sw	t6, 56(a0)
   146 	sw	t7, 60(a0)
   148 	addiu	a0, a0, 64	/* adding 64 to dest */
   149 	sgtu	v1, a0, t9
   150 	bne	a0, a3, $loop16w
   151 	addiu	a1, a1, 64	/* adding 64 to src */
   152 	move	a2, t8
   154 /* Here we have src and dest word-aligned but less than 64-bytes to go */
   156 $chk8w:
   157 	pref 0, 0x0(a1)
   158 	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
   159 				/* the t8 is the remainder count past 32-bytes */
   160 	beq	a2, t8, $chk1w	/* when a2=t8, no 32-byte chunk */
   161 	 nop
   163 	lw	t0, 0(a1)
   164 	lw	t1, 4(a1)
   165 	lw	t2, 8(a1)
   166 	lw	t3, 12(a1)
   167 	lw	t4, 16(a1)
   168 	lw	t5, 20(a1)
   169 	lw	t6, 24(a1)
   170 	lw	t7, 28(a1)
   171 	addiu	a1, a1, 32
   173 	sw	t0, 0(a0)
   174 	sw	t1, 4(a0)
   175 	sw	t2, 8(a0)
   176 	sw	t3, 12(a0)
   177 	sw	t4, 16(a0)
   178 	sw	t5, 20(a0)
   179 	sw	t6, 24(a0)
   180 	sw	t7, 28(a0)
   181 	addiu	a0, a0, 32
   183 $chk1w:
   184 	andi	a2, t8, 0x3	/* now a2 is the remainder past 1w chunks */
   185 	beq	a2, t8, $last8
   186 	subu	a3, t8, a2	/* a3 is count of bytes in 1w chunks */
   187 	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */
   189 /* copying in words (4-byte chunks) */
   190 $wordCopy_loop:
   191 	lw	t3, 0(a1)	/* the first t3 may be equal t0 ... optimize? */
   192 	addiu	a1, a1, 4
   193 	addiu	a0, a0, 4
   194 	bne	a0, a3, $wordCopy_loop
   195 	sw	t3, -4(a0)
   197 /* For the last (<8) bytes */
   198 $last8:
   199 	blez	a2, $leave
   200 	addu	a3, a0, a2	/* a3 is the last dst address */
   201 $last8loop:
   202 	lb	v1, 0(a1)
   203 	addiu	a1, a1, 1
   204 	addiu	a0, a0, 1
   205 	bne	a0, a3, $last8loop
   206 	sb	v1, -1(a0)
   208 $leave:	j	ra
   209 	nop
   211 /*
   212  * UNALIGNED case
   213  */
   215 $unaligned:
   216 	/* got here with a3="negu a0" (set in the delay slot of the branch here) */
   217 	andi	a3, a3, 0x3	/* test if the a0 is word aligned */
   218 	beqz	a3, $ua_chk16w
   219 	subu	a2, a2, a3	/* bytes left after initial a3 bytes */
   221 	LWHI	v1, 0(a1)
   222 	LWLO	v1, 3(a1)
   223 	addu	a1, a1, a3	/* a3 may be here 1, 2 or 3 */
   224 	SWHI	v1, 0(a0)
   225 	addu	a0, a0, a3	/* below the dst will be word aligned (NOTE1) */
   227 $ua_chk16w:	andi	t8, a2, 0x3f	/* any whole 64-byte chunks? */
   228 				/* t8 is the byte count after 64-byte chunks */
   229 	beq	a2, t8, $ua_chk8w	/* if a2==t8, no 64-byte chunks */
   230 				/* There will be at most 1 32-byte chunk after it */
   231 	subu	a3, a2, t8	/* subtract from a2 the remainder */
   232                                 /* Here a3 counts bytes in 16w chunks */
   233 	addu	a3, a0, a3	/* Now a3 is the final dst after 64-byte chunks */
   235 	addu	t0, a0, a2	/* t0 is the "past the end" address */
   237 	subu	t9, t0, 160	/* t9 is the "last safe pref 30, 128(a0)" address */
   239 	pref    0, 0(a1)		/* bring the first line of src, addr 0 */
   240 	pref    0, 32(a1)	/* bring the second line of src, addr 32 */
   241 	pref    0, 64(a1)	/* bring the third line of src, addr 64 */
   242 	pref	30, 32(a0)	/* safe, as we have at least 64 bytes ahead */
   243 /* In case the a0 > t9 don't use "pref 30" at all */
   244 	sgtu	v1, a0, t9
   245 	bgtz	v1, $ua_loop16w	/* skip "pref 30, 64(a0)" for too short arrays */
   246 	nop
   247 /* otherwise,  start with using pref30 */
   248 	pref	30, 64(a0)
   249 $ua_loop16w:
   250 	pref	0, 96(a1)
   251 	LWHI	t0, 0(a1)
   252 	LWLO	t0, 3(a1)
   253 	LWHI	t1, 4(a1)
   254 	bgtz	v1, $ua_skip_pref30_96
   255 	LWLO	t1, 7(a1)
   256 	pref    30, 96(a0)   /* continue setting up the dest, addr 96 */
   257 $ua_skip_pref30_96:
   258 	LWHI	t2, 8(a1)
   259 	LWLO	t2, 11(a1)
   260 	LWHI	t3, 12(a1)
   261 	LWLO	t3, 15(a1)
   262 	LWHI	t4, 16(a1)
   263 	LWLO	t4, 19(a1)
   264 	LWHI	t5, 20(a1)
   265 	LWLO	t5, 23(a1)
   266 	LWHI	t6, 24(a1)
   267 	LWLO	t6, 27(a1)
   268 	LWHI	t7, 28(a1)
   269 	LWLO	t7, 31(a1)
   270         pref    0, 128(a1)    /* bring the next lines of src, addr 128 */
   272 	sw	t0, 0(a0)
   273 	sw	t1, 4(a0)
   274 	sw	t2, 8(a0)
   275 	sw	t3, 12(a0)
   276 	sw	t4, 16(a0)
   277 	sw	t5, 20(a0)
   278 	sw	t6, 24(a0)
   279 	sw	t7, 28(a0)
   281 	LWHI	t0, 32(a1)
   282 	LWLO	t0, 35(a1)
   283 	LWHI	t1, 36(a1)
   284 	bgtz	v1, $ua_skip_pref30_128
   285 	LWLO	t1, 39(a1)
   286 	pref    30, 128(a0)   /* continue setting up the dest, addr 128 */
   287 $ua_skip_pref30_128:
   288 	LWHI	t2, 40(a1)
   289 	LWLO	t2, 43(a1)
   290 	LWHI	t3, 44(a1)
   291 	LWLO	t3, 47(a1)
   292 	LWHI	t4, 48(a1)
   293 	LWLO	t4, 51(a1)
   294 	LWHI	t5, 52(a1)
   295 	LWLO	t5, 55(a1)
   296 	LWHI	t6, 56(a1)
   297 	LWLO	t6, 59(a1)
   298 	LWHI	t7, 60(a1)
   299 	LWLO	t7, 63(a1)
   300         pref    0, 160(a1)    /* bring the next lines of src, addr 160 */
   302 	sw	t0, 32(a0)
   303 	sw	t1, 36(a0)
   304 	sw	t2, 40(a0)
   305 	sw	t3, 44(a0)
   306 	sw	t4, 48(a0)
   307 	sw	t5, 52(a0)
   308 	sw	t6, 56(a0)
   309 	sw	t7, 60(a0)
   311 	addiu	a0, a0, 64	/* adding 64 to dest */
   312 	sgtu	v1, a0, t9
   313 	bne	a0, a3, $ua_loop16w
   314 	addiu	a1, a1, 64	/* adding 64 to src */
   315 	move	a2, t8
   317 /* Here dst is word-aligned (src is not) and less than 64 bytes remain */
   319 $ua_chk8w:
   320 	pref 0, 0x0(a1)
   321 	andi	t8, a2, 0x1f	/* is there a 32-byte chunk? */
   322 				/* the t8 is the remainder count */
   323 	beq	a2, t8, $ua_chk1w	/* when a2=t8, no 32-byte chunk */
       	nop			/* delay slot: keep the first load of the 32-byte copy off the taken path */
   325 	LWHI	t0, 0(a1)
   326 	LWLO	t0, 3(a1)
   327 	LWHI	t1, 4(a1)
   328 	LWLO	t1, 7(a1)
   329 	LWHI	t2, 8(a1)
   330 	LWLO	t2, 11(a1)
   331 	LWHI	t3, 12(a1)
   332 	LWLO	t3, 15(a1)
   333 	LWHI	t4, 16(a1)
   334 	LWLO	t4, 19(a1)
   335 	LWHI	t5, 20(a1)
   336 	LWLO	t5, 23(a1)
   337 	LWHI	t6, 24(a1)
   338 	LWLO	t6, 27(a1)
   339 	LWHI	t7, 28(a1)
   340 	LWLO	t7, 31(a1)
   341 	addiu	a1, a1, 32
   343 	sw	t0, 0(a0)
   344 	sw	t1, 4(a0)
   345 	sw	t2, 8(a0)
   346 	sw	t3, 12(a0)
   347 	sw	t4, 16(a0)
   348 	sw	t5, 20(a0)
   349 	sw	t6, 24(a0)
   350 	sw	t7, 28(a0)
   351 	addiu	a0, a0, 32
   353 $ua_chk1w:
   354 	andi	a2, t8, 0x3	/* now a2 is the remainder past 1w chunks */
   355 	beq	a2, t8, $ua_smallCopy
   356 	subu	a3, t8, a2	/* a3 is count of bytes in 1w chunks */
   357 	addu	a3, a0, a3	/* now a3 is the dst address past the 1w chunks */
   359 /* copying in words (4-byte chunks) */
   360 $ua_wordCopy_loop:
   361 	LWHI	v1, 0(a1)
   362 	LWLO	v1, 3(a1)
   363 	addiu	a1, a1, 4
   364 	addiu	a0, a0, 4		/* note: dst=a0 is word aligned here, see NOTE1 */
   365 	bne	a0, a3, $ua_wordCopy_loop
   366 	sw	v1, -4(a0)
   368 /* Now less than 4 bytes (value in a2) left to copy */
   369 $ua_smallCopy:
   370 	beqz	a2, $leave
   371 	addu	a3, a0, a2	/* a3 is the last dst address */
   372 $ua_smallCopy_loop:
   373 	lb	v1, 0(a1)
   374 	addiu	a1, a1, 1
   375 	addiu	a0, a0, 1
   376 	bne	a0, a3, $ua_smallCopy_loop
   377 	sb	v1, -1(a0)
   379 	j	ra
   380 	nop
   382 END(pixman_mips_fast_memcpy)

mercurial