gfx/skia/trunk/src/opts/memset16_neon.S

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/skia/trunk/src/opts/memset16_neon.S	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,143 @@
     1.4 +/***************************************************************************
     1.5 + * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
     1.6 + *
     1.7 + * Use of this source code is governed by a BSD-style license that can be
     1.8 + * found in the LICENSE file.
     1.9 + ***************************************************************************/
    1.10 +
    1.11 +/***************************************************************************
    1.12 +  Neon memset16: Attempts to do a 16-bit memset with Neon registers if possible.
    1.13 +     Inputs:
    1.14 +        s (r0): The buffer to write to
    1.15 +        c (r1): The 16-bit value to write to the buffer
    1.16 +        n (r2): The number of 16-bit items to write (not a byte count)
    1.17 +     Outputs:
    1.18 +        r0: The original buffer pointer s, unchanged
    1.19 +***************************************************************************/
    1.20 +
    1.21 +        .code 32                                /* assemble as 32-bit ARM, not Thumb */
    1.22 +        .fpu neon                               /* allow NEON mnemonics (vdup/vmov/vst1) */
    1.23 +        .align 4                                /* on ARM gas this is 2^4 = 16-byte alignment */
    1.24 +        .globl memset16_neon
    1.25 +        .type memset16_neon, %function          /* typed as code so the linker can interwork Thumb callers */
    1.26 +        .func memset16_neon                     /* debug-info bracket, closed by .endfunc */
    1.27 +memset16_neon:
    1.28 +        cmp             r2, #0                  /* zero items: nothing to write */
    1.29 +        bxeq            lr
    1.30 +
    1.31 +        /* Keep in mind that r2 -- the count argument -- is the number of
    1.32 +         * 16-bit items to write, so double it to get the byte count.
    1.33 +         */
    1.34 +        lsl             r2, r2, #1
    1.35 +
    1.36 +        push            {r0}                    /* r0 is advanced below; restored before returning */
    1.37 +
    1.38 +        /* If we have <= 8 bytes, just do a quick loop to handle that */
    1.39 +        cmp             r2, #8
    1.40 +        bgt             memset_gt4
    1.41 +memset_smallcopy_loop:
    1.42 +        strh            r1, [r0], #2            /* one 16-bit item per iteration */
    1.43 +        subs            r2, r2, #2
    1.44 +        bne             memset_smallcopy_loop
    1.45 +memset_smallcopy_done:
    1.46 +        pop             {r0}
    1.47 +        bx              lr
    1.48 +
    1.49 +memset_gt4:
    1.50 +        /*
    1.51 +         * Duplicate the r1 lowest 16-bits across r1. The idea is to have
    1.52 +         * a register with two 16-bit-values we can copy. The ORR only gives
    1.53 +         * two equal halves if bits 16-31 of r1 are zero on entry.
    1.54 +         */
    1.55 +        orr             r1, r1, r1, lsl #16
    1.56 +        /*
    1.57 +         * If we are writing >= 64 bytes, then we may want to get
    1.58 +         * onto a 16-byte boundary to improve speed even more.
    1.59 +         */
    1.60 +        cmp             r2, #64
    1.61 +        blt             memset_route
    1.62 +        ands            r12, r0, #0xf           /* r12 = offset of r0 within its 16-byte line */
    1.63 +        beq             memset_route            /* already 16-byte aligned */
    1.64 +        /*
    1.65 +         * Determine the number of bytes to move forward to get to the 16-byte
    1.66 +         * boundary.  The buffer is only assumed to be 2-byte aligned, so store
    1.67 +         * 2, then 4, then 8 bytes: each store is then naturally aligned.
    1.68 +         */
    1.69 +        rsb             r12, r12, #16           /* r12 = bytes up to the boundary */
    1.70 +        sub             r2, r2, r12             /* those bytes are written right here */
    1.71 +        tst             r12, #2                 /* bit 1 set: one halfword needed */
    1.72 +        strneh          r1, [r0], #2            /* r0 is 4-byte aligned after this */
    1.73 +        lsls            r12, r12, #29           /* N = bit 2 (4 bytes), C = bit 3 (8 bytes) */
    1.74 +        strmi           r1, [r0], #4            /* r0 is 8-byte aligned after this */
    1.75 +        strcs           r1, [r0], #4
    1.76 +        strcs           r1, [r0], #4            /* r0 is 16-byte aligned after this */
    1.77 +memset_route:
    1.78 +        /*
    1.79 +         * Decide where to route for the maximum copy sizes.  Note that we
    1.80 +         * build q0 and q1 depending on if we need them: d0 always, d1 (the
    1.81 +         * rest of q0) from 16 bytes up, and q1 from 64 bytes up.
    1.82 +         */
    1.83 +        vdup.u32        d0, r1
    1.84 +        cmp             r2, #16
    1.85 +        blt             memset_8
    1.86 +        vmov            d1, d0
    1.87 +        cmp             r2, #64
    1.88 +        blt             memset_16               /* 16..63 bytes use the 16-byte loop */
    1.89 +        vmov            q1, q0
    1.90 +        cmp             r2, #128
    1.91 +        blt             memset_32
    1.92 +memset_128:
    1.93 +        mov             r12, r2, lsr #7         /* r12 = number of 128-byte chunks, >= 1 here */
    1.94 +memset_128_loop:
    1.95 +        vst1.64         {q0, q1}, [r0]!         /* 4 x 32 bytes per iteration */
    1.96 +        vst1.64         {q0, q1}, [r0]!
    1.97 +        vst1.64         {q0, q1}, [r0]!
    1.98 +        vst1.64         {q0, q1}, [r0]!
    1.99 +        subs            r12, r12, #1
   1.100 +        bne             memset_128_loop
   1.101 +        ands            r2, r2, #0x7f           /* bytes left after the 128-byte chunks */
   1.102 +        beq             memset_end
   1.103 +memset_32:
   1.104 +        movs            r12, r2, lsr #5         /* r12 = number of 32-byte chunks */
   1.105 +        beq             memset_16
   1.106 +memset_32_loop:
   1.107 +        subs            r12, r12, #1            /* flags survive the store below */
   1.108 +        vst1.64         {q0, q1}, [r0]!
   1.109 +        bne             memset_32_loop
   1.110 +        ands            r2, r2, #0x1f           /* bytes left after the 32-byte chunks */
   1.111 +        beq             memset_end
   1.112 +memset_16:
   1.113 +        movs            r12, r2, lsr #4         /* r12 = number of 16-byte chunks */
   1.114 +        beq             memset_8
   1.115 +memset_16_loop:
   1.116 +        subs            r12, r12, #1            /* flags survive the store below */
   1.117 +        vst1.32         {q0}, [r0]!
   1.118 +        bne             memset_16_loop
   1.119 +        ands            r2, r2, #0xf            /* bytes left after the 16-byte chunks */
   1.120 +        beq             memset_end
   1.121 +        /*
   1.122 +         * memset_8 is not a loop, since we try to do our loops at 16
   1.123 +         * bytes and above.  We should loop there, then drop down here
   1.124 +         * to finish the <16-byte versions.  Same for memset_4 and
   1.125 +         * memset_2.
   1.126 +         */
   1.127 +memset_8:
   1.128 +        cmp             r2, #8
   1.129 +        blt             memset_4
   1.130 +        subs            r2, r2, #8
   1.131 +        vst1.32         {d0}, [r0]!
   1.132 +memset_4:
   1.133 +        cmp             r2, #4
   1.134 +        blt             memset_2
   1.135 +        subs            r2, r2, #4
   1.136 +        str             r1, [r0], #4
   1.137 +memset_2:
   1.138 +        cmp             r2, #0
   1.139 +        ble             memset_end
   1.140 +        strh            r1, [r0], #2            /* final 16-bit item */
   1.141 +memset_end:
   1.142 +        pop             {r0}                    /* return with the original buffer pointer in r0 */
   1.143 +        bx              lr
   1.144 +
   1.145 +        .endfunc
   1.146 +        .end

mercurial