diff -r 000000000000 -r 6474c204b198 gfx/skia/trunk/src/opts/memset.arm.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gfx/skia/trunk/src/opts/memset.arm.S	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,111 @@
+/*
+ * Copyright 2010 The Android Open Source Project
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+/* Changes:
+ * 2010-08-11 Steve McIntyre
+ *    Added small changes to the two functions to make them work on the
+ *    specified number of 16- or 32-bit values rather than the original
+ *    code which was specified as a count of bytes. More verbose comments
+ *    to aid future maintenance.
+ */
+
+    .text
+    .align 4
+    .syntax unified
+
+    .global arm_memset32
+    .type   arm_memset32, %function
+    .global arm_memset16
+    .type   arm_memset16, %function
+
+/*
+ * Optimized memset functions for ARM.
+ *
+ * void arm_memset16(uint16_t* dst, uint16_t value, int count);
+ * void arm_memset32(uint32_t* dst, uint32_t value, int count);
+ *
+ */
+arm_memset16:
+        .fnstart
+        push        {lr}
+
+        /* if count is equal to zero then abort */
+        teq         r2, #0
+        ble         .Lfinish
+
+        /* Multiply count by 2 - go from the number of 16-bit shorts
+         * to the number of bytes desired. */
+        mov         r2, r2, lsl #1
+
+        /* expand the data to 32 bits */
+        orr         r1, r1, r1, lsl #16
+
+        /* align to 32 bits */
+        tst         r0, #2
+        strhne      r1, [r0], #2
+        subne       r2, r2, #2
+
+        /* Now jump into the main loop below. */
+        b           .Lwork_32
+        .fnend
+
+arm_memset32:
+        .fnstart
+        push        {lr}
+
+        /* if count is equal to zero then abort */
+        teq         r2, #0
+        ble         .Lfinish
+
+        /* Multiply count by 4 - go from the number of 32-bit words to
+         * the number of bytes desired. */
+        mov         r2, r2, lsl #2
+
+.Lwork_32:
+        /* Set up registers ready for writing them out. */
+        mov         ip, r1
+        mov         lr, r1
+
+        /* Try to align the destination to a cache line. Assume 32
+         * byte (8 word) cache lines, it's the common case. */
+        rsb         r3, r0, #0
+        ands        r3, r3, #0x1C
+        beq         .Laligned32
+        cmp         r3, r2
+        andhi       r3, r2, #0x1C
+        sub         r2, r2, r3
+
+        /* (Optionally) write any unaligned leading bytes.
+         * (0-28 bytes, length in r3) */
+        movs        r3, r3, lsl #28
+        stmiacs     r0!, {r1, lr}
+        stmiacs     r0!, {r1, lr}
+        stmiami     r0!, {r1, lr}
+        movs        r3, r3, lsl #2
+        strcs       r1, [r0], #4
+
+        /* Now quickly loop through the cache-aligned data. */
+.Laligned32:
+        mov         r3, r1
+1:      subs        r2, r2, #32
+        stmiahs     r0!, {r1,r3,ip,lr}
+        stmiahs     r0!, {r1,r3,ip,lr}
+        bhs         1b
+        add         r2, r2, #32
+
+        /* (Optionally) store any remaining trailing bytes.
+         * (0-30 bytes, length in r2) */
+        movs        r2, r2, lsl #28
+        stmiacs     r0!, {r1,r3,ip,lr}
+        stmiami     r0!, {r1,lr}
+        movs        r2, r2, lsl #2
+        strcs       r1, [r0], #4
+        strhmi      lr, [r0], #2
+
+.Lfinish:
+        pop         {pc}
+        .fnend
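
For readers unfamiliar with ARM assembly, here is a minimal C model of the two entry points. This sketch is not part of the patch: the `_ref` names are illustrative, and it deliberately omits the alignment and multi-word store scheduling that the assembly exists to provide; only the observable effect (store `count` 16- or 32-bit values) is modelled.

#include <stdint.h>

/* Reference model only; the real prototypes are in the patch's comment
 * block. The assembly aborts when count is zero; this model likewise
 * stores nothing for a zero (or negative) count. */
void arm_memset16_ref(uint16_t* dst, uint16_t value, int count) {
    while (count-- > 0)
        *dst++ = value;
}

void arm_memset32_ref(uint32_t* dst, uint32_t value, int count) {
    while (count-- > 0)
        *dst++ = value;
}

An illustrative call site (values hypothetical): arm_memset16(row, 0xF800, width) would fill one row of an RGB565 surface with width red pixels.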
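
One non-obvious idiom in the patch is the pair of movs ..., lsl #28 / lsl #2 instructions handling the leading and trailing bytes: shifting the residual byte count left by 28 moves bit 4 into the carry flag and bit 3 into the sign flag, so the conditional stores that follow (cs/mi) consume 16 and 8 bytes with no branches; the further shift by 2 exposes bits 2 and 1 the same way for the word and halfword stores. Below is a C sketch of the trailing-byte dispatch; store_tail is a hypothetical helper, and it assumes a little-endian target as on the ARM configurations this file is built for.

#include <stdint.h>
#include <string.h>

/* Hypothetical helper mirroring the tail after the .Laligned32 loop:
 * rem is the residual byte count (0-30) and v the replicated 32-bit
 * fill pattern. Each bit of rem selects one conditional store, exactly
 * as the carry/sign flag tests do in the assembly. */
static uint8_t* store_tail(uint8_t* p, uint32_t v, unsigned rem) {
    if (rem & 16) {                       /* stmiacs {r1,r3,ip,lr}: 16 bytes */
        for (int i = 0; i < 4; i++, p += 4)
            memcpy(p, &v, 4);
    }
    if (rem & 8) {                        /* stmiami {r1,lr}: 8 bytes */
        memcpy(p, &v, 4);
        memcpy(p + 4, &v, 4);
        p += 8;
    }
    if (rem & 4) {                        /* strcs: one word */
        memcpy(p, &v, 4);
        p += 4;
    }
    if (rem & 2) {                        /* strhmi: low halfword of v */
        memcpy(p, &v, 2);
        p += 2;
    }
    return p;
}

The leading-byte code before .Laligned32 uses the same flag-shift dispatch, except that its residue is word-aligned (ands with 0x1C), so only the 16-, 8- and 4-byte stores appear there.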