gfx/skia/trunk/src/opts/memset.arm.S

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/gfx/skia/trunk/src/opts/memset.arm.S	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,111 @@
     1.4 +/*
     1.5 + * Copyright 2010 The Android Open Source Project
     1.6 + *
     1.7 + * Use of this source code is governed by a BSD-style license that can be
     1.8 + * found in the LICENSE file.
     1.9 + */
    1.10 +
    1.11 +/* Changes:
    1.12 + * 2010-08-11 Steve McIntyre <steve.mcintyre@arm.com>
    1.13 + *    Added small changes to the two functions to make them work on the
    1.14 + *    specified number of 16- or 32-bit values rather than the original
    1.15 + *    code which was specified as a count of bytes. More verbose comments
    1.16 + *    to aid future maintenance.
    1.17 + */
    1.18 +
    1.19 +    .syntax unified                 /* UAL mnemonics: strhne, stmiacs, ... */
    1.20 +    .text
    1.21 +    .p2align 4                      /* 2^4 = 16 bytes, same as ARM .align 4 */
    1.22 +
    1.23 +    .globl  arm_memset16
    1.24 +    .type   arm_memset16, %function
    1.25 +    .globl  arm_memset32
    1.26 +    .type   arm_memset32, %function
    1.27 +
    1.28 +/*
    1.29 + * Optimized memset functions for ARM.
    1.30 + *
    1.31 + * void arm_memset16(uint16_t* dst, uint16_t value, int count);
    1.32 + * void arm_memset32(uint32_t* dst, uint32_t value, int count);
    1.33 + *
    1.34 + */
    1.35 +arm_memset16:   /* In: r0 = dst, r1 = value, r2 = count in 16-bit units */
    1.36 +        .fnstart
    1.37 +        push        {lr}            /* shared tail returns via pop {pc} */
    1.38 +
    1.39 +        /* Nothing to do when count <= 0 (signed). Use cmp, not teq: ble
    1.40 +         * tests Z and N != V, and teq would leave V stale from the caller. */
    1.41 +        cmp         r2, #0
    1.42 +        ble         .Lfinish
    1.43 +
    1.44 +        /* r2 = count * 2: number of 16-bit shorts -> number of bytes. */
    1.45 +        mov         r2, r2, lsl #1
    1.46 +
    1.47 +        /* r1 |= r1 << 16: halfword in both halves if r1[31:16] arrive zero. */
    1.48 +        orr         r1, r1, r1, lsl #16
    1.49 +
    1.50 +        /* If dst bit 1 is set, store one halfword to reach 32-bit alignment. */
    1.51 +        tst         r0, #2
    1.52 +        strhne      r1, [r0], #2
    1.53 +        subne       r2, r2, #2      /* account for the 2 bytes just written */
    1.54 +
    1.55 +        /* Join the word-fill path: r0 = dst, r1 = fill word, r2 = bytes. */
    1.56 +        b           .Lwork_32
    1.57 +        .fnend
    1.58 +
    1.59 +arm_memset32:   /* In: r0 = dst, r1 = value, r2 = count in 32-bit units */
    1.60 +        .fnstart
    1.61 +        push        {lr}            /* lr is reused below as a fill register */
    1.62 +
    1.63 +        /* Nothing to do when count <= 0 (signed). */
    1.64 +        cmp         r2, #0          /* cmp, not teq: ble needs a valid V flag */
    1.65 +        ble         .Lfinish
    1.66 +
    1.67 +        /* Multiply count by 4 - go from the number of 32-bit words to
    1.68 +         * the number of bytes desired. */
    1.69 +        mov         r2, r2, lsl #2
    1.70 +
    1.71 +.Lwork_32:      /* Shared entry: r0 = dst, r1 = fill word, r2 = byte count */
    1.72 +        /* Set up registers ready for writing them out. */
    1.73 +        mov         ip, r1
    1.74 +        mov         lr, r1
    1.75 +
    1.76 +        /* Try to align the destination to a cache line. Assume 32
    1.77 +         * byte (8 word) cache lines, it's the common case. */
    1.78 +        rsb         r3, r0, #0          /* r3 = -dst */
    1.79 +        ands        r3, r3, #0x1C       /* whole-word bytes up to a 32-byte boundary */
    1.80 +        beq         .Laligned32         /* already on a boundary */
    1.81 +        cmp         r3, r2
    1.82 +        andhi       r3, r2, #0x1C       /* fewer bytes left: clamp to words in r2 */
    1.83 +        sub         r2, r2, r3          /* r2 = bytes left after the head */
    1.84 +
    1.85 +        /* (Optionally) write any unaligned leading bytes.
    1.86 +         * (0-28 bytes, length in r3) */
    1.87 +        movs        r3, r3, lsl #28     /* C = bit 4 (16 bytes), N = bit 3 (8 bytes) */
    1.88 +        stmiacs     r0!, {r1, lr}
    1.89 +        stmiacs     r0!, {r1, lr}
    1.90 +        stmiami     r0!, {r1, lr}
    1.91 +        movs        r3, r3, lsl #2      /* C = original bit 2 (4 bytes) */
    1.92 +        strcs       r1, [r0], #4
    1.93 +
    1.94 +        /* Now quickly loop through the cache-aligned data. */
    1.95 +.Laligned32:
    1.96 +        mov         r3, r1              /* r1, r3, ip, lr all hold the fill word */
    1.97 +1:      subs        r2, r2, #32         /* hs (C set) while >= 32 bytes remained */
    1.98 +        stmiahs     r0!, {r1,r3,ip,lr}
    1.99 +        stmiahs     r0!, {r1,r3,ip,lr}
   1.100 +        bhs         1b
   1.101 +        add         r2, r2, #32         /* undo the final over-subtraction */
   1.102 +
   1.103 +        /* (Optionally) store any remaining trailing bytes.
   1.104 +         * (0-30 bytes, length in r2) */
   1.105 +        movs        r2, r2, lsl #28     /* C = bit 4 (16 bytes), N = bit 3 (8 bytes) */
   1.106 +        stmiacs     r0!, {r1,r3,ip,lr}
   1.107 +        stmiami     r0!, {r1,lr}
   1.108 +        movs        r2, r2, lsl #2      /* C = bit 2 (4 bytes), N = bit 1 (2 bytes) */
   1.109 +        strcs       r1, [r0], #4
   1.110 +        strhmi      lr, [r0], #2
   1.111 +
   1.112 +.Lfinish:
   1.113 +        pop         {pc}                /* return: saved lr popped straight into pc */
   1.114 +        .fnend

mercurial