diff -r 000000000000 -r 6474c204b198 gfx/skia/trunk/src/opts/memset16_neon.S
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/gfx/skia/trunk/src/opts/memset16_neon.S	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,143 @@
+/***************************************************************************
+ * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ ***************************************************************************/
+
+/***************************************************************************
+  Neon memset16: Fills a buffer with a 16-bit value, using Neon registers
+  when the fill is large enough.
+     Inputs:
+        r0 (s): The buffer to write to
+        r1 (c): The 16-bit value to write to the buffer
+        r2 (n): The count, in 16-bit items (not bytes)
+     Outputs: r0 holds the original buffer pointer on return
+***************************************************************************/
+
+        .code 32
+        .fpu neon
+        .align 4
+        .globl memset16_neon
+        .func
+
+memset16_neon:                          /* r0 = dst, r1 = 16-bit value, r2 = item count */
+        cmp             r2, #0
+        bxle            lr              /* nothing to do for count <= 0 (negative would run away) */
+
+        /* r2 -- the count argument -- arrives as a number of 16-bit
+         * items; from here on it holds the remaining size in bytes.
+         */
+        lsl             r2, r2, #1
+
+        push            {r0}            /* restored by the pop on each exit path */
+
+        /* If we have <= 8 bytes, just do a quick halfword loop to handle that */
+        cmp             r2, #8
+        bgt             memset_gt4
+memset_smallcopy_loop:
+        strh            r1, [r0], #2
+        subs            r2, r2, #2
+        bne             memset_smallcopy_loop
+memset_smallcopy_done:
+        pop             {r0}
+        bx              lr
+
+memset_gt4:
+        /*
+         * Duplicate the lowest 16 bits of r1 into its upper 16 bits, so
+         * that r1 holds two copies of the value and one 32-bit store
+         * writes two items. Assumes the upper 16 bits of r1 are zero.
+         */
+        orr             r1, r1, r1, lsl #16
+        /*
+         * If we're copying at least 64 bytes, first get onto a 16-byte
+         * boundary to improve speed even more.
+         */
+        cmp             r2, #64
+        blt             memset_route
+        ands            r12, r0, #0xf
+        beq             memset_route
+        /*
+         * r12 = bytes needed to reach the 16-byte boundary. Bits 3, 2 and 1
+         * of r12 select the 8-, 4- and 2-byte stores below; the word stores
+         * run first, so they may hit an address that is only 2-byte aligned.
+         */
+        rsb             r12, r12, #16
+        sub             r2, r2, r12
+        lsls            r12, r12, #29   /* C = bit 3, N = bit 2 of r12 */
+        strmi           r1, [r0], #4
+        strcs           r1, [r0], #4
+        strcs           r1, [r0], #4
+        lsls            r12, r12, #2    /* C = bit 1 of the original r12 */
+        strcsh          r1, [r0], #2
+memset_route:
+        /*
+         * Decide where to route for the maximum copy sizes.  Note that we
+         * build d0, q0 and q1 only as far as the chosen path needs them,
+         * so that's interwoven here as well.
+         */
+        vdup.u32        d0, r1
+        cmp             r2, #16
+        blt             memset_8
+        vmov            d1, d0
+        cmp             r2, #64
+        blt             memset_16
+        vmov            q1, q0
+        cmp             r2, #128
+        blt             memset_32
+memset_128:
+        mov             r12, r2, lsr #7 /* r12 = number of 128-byte chunks */
+memset_128_loop:
+        vst1.64         {q0, q1}, [r0]!
+        vst1.64         {q0, q1}, [r0]!
+        vst1.64         {q0, q1}, [r0]!
+        vst1.64         {q0, q1}, [r0]!
+        subs            r12, r12, #1
+        bne             memset_128_loop
+        ands            r2, r2, #0x7f   /* r2 = bytes left after the 128-byte chunks */
+        beq             memset_end
+memset_32:
+        movs            r12, r2, lsr #5 /* r12 = number of 32-byte chunks */
+        beq             memset_16
+memset_32_loop:
+        subs            r12, r12, #1
+        vst1.64         {q0, q1}, [r0]!
+        bne             memset_32_loop
+        ands            r2, r2, #0x1f
+        beq             memset_end
+memset_16:
+        movs            r12, r2, lsr #4 /* r12 = number of 16-byte chunks */
+        beq             memset_8
+memset_16_loop:
+        subs            r12, r12, #1
+        vst1.32         {q0}, [r0]!
+        bne             memset_16_loop
+        ands            r2, r2, #0xf
+        beq             memset_end
+        /*
+         * memset_8 isn't a loop, since we try to do our loops at 16
+         * bytes and above.  We should loop there, then drop down here
+         * to finish the <16-byte versions.  Same for memset_4 and
+         * memset_2.
+         */
+memset_8:
+        cmp             r2, #8
+        blt             memset_4
+        subs            r2, r2, #8
+        vst1.32         {d0}, [r0]!
+memset_4:
+        cmp             r2, #4
+        blt             memset_2
+        subs            r2, r2, #4
+        str             r1, [r0], #4
+memset_2:
+        cmp             r2, #0
+        ble             memset_end
+        strh            r1, [r0], #2
+memset_end:
+        pop             {r0}
+        bx              lr
+
+        .endfunc
+        .end