gfx/skia/trunk/src/opts/memset16_neon.S

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double key logic according to:
private browsing mode or privacy.thirdparty.isolate preference and
implement in GetCookieStringCommon and FindCookie where it counts...
With some reservations of how to convince FindCookie users to test
condition and pass a nullptr when disabling double key logic.

michael@0 1 /***************************************************************************
michael@0 2 * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
michael@0 3 *
michael@0 4 * Use of this source code is governed by a BSD-style license that can be
michael@0 5 * found in the LICENSE file.
michael@0 6 ***************************************************************************/
michael@0 7
michael@0 8 /***************************************************************************
michael@0 9 Neon memset16: Fills a buffer with a 16-bit value, using Neon registers if possible,
michael@0 10 Inputs:
michael@0 11 s: The buffer to write to (r0)
michael@0 12 c: The 16-bit value to write to the buffer (r1)
michael@0 13 n: The size_t count of 16-bit items to write (r2), not a byte count.
michael@0 14 Outputs:
michael@0 15 r0 is left holding s: it is saved on entry and restored before returning.
michael@0 16 ***************************************************************************/
michael@0 17
michael@0 18 .code 32 /* assemble 32-bit ARM (not Thumb) instructions */
michael@0 19 .fpu neon /* allow the Neon vdup/vmov/vst1 instructions used below */
michael@0 20 .align 4 /* on ARM gas the operand is a power of two: 16-byte alignment */
michael@0 21 .globl memset16_neon /* make the entry point visible to the linker */
michael@0 22 .func
michael@0 23
michael@0 24 memset16_neon:
michael@0 25 cmp r2, #0 /* r2 = number of 16-bit items */
michael@0 26 bxeq lr /* zero items: return at once, nothing pushed yet */
michael@0 27
michael@0 28 /* Keep in mind that r2 -- the count argument -- is for the
michael@0 29 * number of 16-bit items to copy.
michael@0 30 */
michael@0 31 lsl r2, r2, #1 /* from here on r2 is the remaining byte count */
michael@0 32
michael@0 33 push {r0} /* saved so every exit path can restore the original pointer */
michael@0 34
michael@0 35 /* If we have <= 8 bytes (at most 4 items), just do a quick loop to handle that */
michael@0 36 cmp r2, #8
michael@0 37 bgt memset_gt4 /* signed compare: taken only for more than 8 bytes */
michael@0 38 memset_smallcopy_loop:
michael@0 39 strh r1, [r0], #2 /* store low halfword of r1, then advance r0 by 2 */
michael@0 40 subs r2, r2, #2
michael@0 41 bne memset_smallcopy_loop /* r2 is even, so stepping by 2 lands exactly on 0 */
michael@0 42 memset_smallcopy_done:
michael@0 43 pop {r0}
michael@0 44 bx lr
michael@0 45
michael@0 46 memset_gt4:
michael@0 47 /*
michael@0 48 * Duplicate the r1 lowest 16-bits across r1. The idea is to have
michael@0 49 * a register with two 16-bit-values we can copy. We do this by
michael@0 50 * duplicating lowest 16-bits of r1 to upper 16-bits.
michael@0 51 */
michael@0 52 orr r1, r1, r1, lsl #16 /* NOTE(review): clean pair only if bits 16-31 of r1 are zero on entry -- confirm with callers */
michael@0 53 /*
michael@0 54 * If we're copying >= 64 bytes, then we may want to get
michael@0 55 * onto a 16-byte boundary to improve speed even more.
michael@0 56 */
michael@0 57 cmp r2, #64
michael@0 58 blt memset_route
michael@0 59 ands r12, r0, #0xf /* r12 = offset of r0 inside its 16-byte block */
michael@0 60 beq memset_route /* already on a 16-byte boundary */
michael@0 61 /*
michael@0 62 * Determine the number of bytes to move forward to get to the 16-byte
michael@0 63 * boundary. The stores below act on bits 1-3 of that distance (2, 4 and
michael@0 64 * 8 bytes); bit 0 is ignored, so an odd address never reaches the boundary.
michael@0 65 */
michael@0 66 rsb r12, r12, #16 /* r12 = 16 - (r0 & 0xf) */
michael@0 67 sub r2, r2, r12 /* those bytes come off the remaining count */
michael@0 68 lsls r12, r12, #29 /* N = bit 2 of the distance, C = bit 3 */
michael@0 69 strmi r1, [r0], #4 /* bit 2 set: 4 bytes */
michael@0 70 strcs r1, [r0], #4 /* bit 3 set: 8 bytes, written as two words */
michael@0 71 strcs r1, [r0], #4
michael@0 72 lsls r12, r12, #2 /* C = bit 1 of the original distance */
michael@0 73 strcsh r1, [r0], #2 /* bit 1 set: one halfword */
michael@0 74 memset_route:
michael@0 75 /*
michael@0 76 * Decide where to route for the maximum copy sizes. Note that we
michael@0 77 * build q0 and q1 depending on if we'll need it, so that's
michael@0 78 * interwoven here as well.
michael@0 79 */
michael@0 80 vdup.u32 d0, r1 /* d0 = r1 in both 32-bit lanes (8 bytes of pattern) */
michael@0 81 cmp r2, #16
michael@0 82 blt memset_8
michael@0 83 vmov d1, d0 /* q0 (d0:d1) now holds 16 bytes of pattern */
michael@0 84 cmp r2, #64
michael@0 85 blt memset_16 /* below 64 bytes q1 is never built, so use the 16-byte loop */
michael@0 86 vmov q1, q0 /* q0:q1 now hold 32 bytes of pattern */
michael@0 87 cmp r2, #128
michael@0 88 blt memset_32
michael@0 89 memset_128:
michael@0 90 mov r12, r2, lsr #7 /* r12 = number of whole 128-byte chunks */
michael@0 91 memset_128_loop:
michael@0 92 vst1.64 {q0, q1}, [r0]! /* 4 x 32 bytes = 128 bytes per iteration */
michael@0 93 vst1.64 {q0, q1}, [r0]!
michael@0 94 vst1.64 {q0, q1}, [r0]!
michael@0 95 vst1.64 {q0, q1}, [r0]!
michael@0 96 subs r12, r12, #1
michael@0 97 bne memset_128_loop
michael@0 98 ands r2, r2, #0x7f /* r2 = bytes left after the 128-byte chunks */
michael@0 99 beq memset_end
michael@0 100 memset_32:
michael@0 101 movs r12, r2, lsr #5 /* r12 = number of whole 32-byte chunks */
michael@0 102 beq memset_16
michael@0 103 memset_32_loop:
michael@0 104 subs r12, r12, #1
michael@0 105 vst1.64 {q0, q1}, [r0]! /* the store leaves the flags from subs intact for bne */
michael@0 106 bne memset_32_loop
michael@0 107 ands r2, r2, #0x1f /* r2 = bytes left after the 32-byte chunks */
michael@0 108 beq memset_end
michael@0 109 memset_16:
michael@0 110 movs r12, r2, lsr #4 /* r12 = number of whole 16-byte chunks */
michael@0 111 beq memset_8
michael@0 112 memset_16_loop:
michael@0 113 subs r12, r12, #1
michael@0 114 vst1.32 {q0}, [r0]! /* the store leaves the flags from subs intact for bne */
michael@0 115 bne memset_16_loop
michael@0 116 ands r2, r2, #0xf /* r2 = bytes left after the 16-byte chunks */
michael@0 117 beq memset_end
michael@0 118 /*
michael@0 119 * memset_8 isn't a loop, since we try to do our loops at 16
michael@0 120 * bytes and above. We should loop there, then drop down here
michael@0 121 * to finish the <16-byte versions. Same for memset_4 and
michael@0 122 * memset_2.
michael@0 123 */
michael@0 124 memset_8:
michael@0 125 cmp r2, #8
michael@0 126 blt memset_4
michael@0 127 subs r2, r2, #8
michael@0 128 vst1.32 {d0}, [r0]! /* 8 bytes from d0 */
michael@0 129 memset_4:
michael@0 130 cmp r2, #4
michael@0 131 blt memset_2
michael@0 132 subs r2, r2, #4
michael@0 133 str r1, [r0], #4 /* 4 bytes: both halfwords of r1 */
michael@0 134 memset_2:
michael@0 135 cmp r2, #0
michael@0 136 ble memset_end
michael@0 137 strh r1, [r0], #2 /* one remaining halfword */
michael@0 138 memset_end:
michael@0 139 pop {r0} /* restore the original destination pointer */
michael@0 140 bx lr
michael@0 141
michael@0 142 .endfunc
michael@0 143 .end

mercurial