/***************************************************************************
 * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
 *
 * Use of this source code is governed by a BSD-style license that can be
 * found in the LICENSE file.
 ***************************************************************************/

/***************************************************************************
  Neon memset: Attempts to do a memset with Neon registers if possible,
     Inputs:
        s: The buffer to write to
        c: The integer data to write to the buffer
        n: The size_t count.
     Outputs:

***************************************************************************/
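
/*
 * Per the AAPCS, the arguments above arrive as r0 = s, r1 = c and r2 = n.
 * r2 is treated as a count of 16-bit elements, and r0 is saved and
 * restored so the buffer pointer is handed back to the caller, memset-style.
 * The equivalent C prototype is presumably something like
 *     void *memset16_neon(uint16_t *s, uint16_t c, size_t n);
 * though the exact return type is not stated in this file.
 */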

        .code 32
        .fpu neon
        .align 4
        .globl memset16_neon
        .func

memset16_neon:
        cmp     r2, #0
        bxeq    lr

        /* Keep in mind that r2 -- the count argument -- is for the
         * number of 16-bit items to copy.
         */
        lsl     r2, r2, #1
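        /* r2 now holds the length in bytes. */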

        push    {r0}

        /* If we have 8 bytes or fewer, just do a quick loop to handle that */
        cmp     r2, #8
        bgt     memset_gt4
memset_smallcopy_loop:
        strh    r1, [r0], #2
        subs    r2, r2, #2
        bne     memset_smallcopy_loop
memset_smallcopy_done:
        pop     {r0}
        bx      lr

memset_gt4:
        /*
         * Duplicate the lowest 16 bits of r1 across all of r1. The idea is
         * to have a register holding two copies of the 16-bit value that we
         * can store at once. We do this by copying the lowest 16 bits of r1
         * into its upper 16 bits.
         */
        orr     r1, r1, r1, lsl #16
        /*
         * If we're copying 64 bytes or more, then we may want to get
         * onto a 16-byte boundary to improve speed even more.
         */
        cmp     r2, #64
        blt     memset_route
        ands    r12, r0, #0xf
        beq     memset_route
        /*
         * Determine the number of bytes to move forward to get to the
         * 16-byte boundary. This will be a multiple of 2 (and a multiple
         * of 4 when the destination happens to be word-aligned); the word
         * stores below handle the word-sized part and the trailing strcsh
         * handles an odd halfword.
         */
        rsb     r12, r12, #16
        sub     r2, r2, r12
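        /*
         * Shift the adjustment so that its bit 2 lands in the N flag and
         * bit 3 lands in the carry: strmi stores one word and the two
         * strcs store two more words. The second shift moves bit 1 into
         * the carry so strcsh can store the remaining halfword.
         */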
        lsls    r12, r12, #29
        strmi   r1, [r0], #4
        strcs   r1, [r0], #4
        strcs   r1, [r0], #4
        lsls    r12, r12, #2
        strcsh  r1, [r0], #2
memset_route:
        /*
         * Decide where to route for the maximum copy sizes. Note that we
         * build up q0 and q1 only if we'll need them, so that work is
         * interwoven here as well.
         */
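        /*
         * d0 holds the 32-bit pattern in both of its lanes (8 bytes of
         * data), q0 widens that to 16 bytes, and q0/q1 together provide
         * 32 bytes per store.
         */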
        vdup.u32 d0, r1
        cmp     r2, #16
        blt     memset_8
        vmov    d1, d0
        cmp     r2, #64
        blt     memset_16
        vmov    q1, q0
        cmp     r2, #128
        blt     memset_32
memset_128:
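        /* r12 = number of 128-byte blocks; each iteration stores 4 x 32 bytes. */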
        mov     r12, r2, lsr #7
memset_128_loop:
        vst1.64 {q0, q1}, [r0]!
        vst1.64 {q0, q1}, [r0]!
        vst1.64 {q0, q1}, [r0]!
        vst1.64 {q0, q1}, [r0]!
        subs    r12, r12, #1
        bne     memset_128_loop
        ands    r2, r2, #0x7f
        beq     memset_end
memset_32:
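        /* r12 = number of remaining 32-byte blocks. */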
        movs    r12, r2, lsr #5
        beq     memset_16
memset_32_loop:
        subs    r12, r12, #1
        vst1.64 {q0, q1}, [r0]!
        bne     memset_32_loop
        ands    r2, r2, #0x1f
        beq     memset_end
memset_16:
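        /* r12 = number of remaining 16-byte blocks. */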
        movs    r12, r2, lsr #4
        beq     memset_8
memset_16_loop:
        subs    r12, r12, #1
        vst1.32 {q0}, [r0]!
        bne     memset_16_loop
        ands    r2, r2, #0xf
        beq     memset_end
        /*
         * memset_8 isn't a loop, since we try to do our loops at 16
         * bytes and above. We should loop there, then drop down here
         * to finish the <16-byte cases. Same for memset_4 and
         * memset_2.
         */
memset_8:
        cmp     r2, #8
        blt     memset_4
        subs    r2, r2, #8
        vst1.32 {d0}, [r0]!
memset_4:
        cmp     r2, #4
        blt     memset_2
        subs    r2, r2, #4
        str     r1, [r0], #4
memset_2:
        cmp     r2, #0
        ble     memset_end
        strh    r1, [r0], #2
memset_end:
        pop     {r0}
        bx      lr

        .endfunc
        .end