gfx/skia/trunk/src/opts/memset16_neon.S

Sat, 03 Jan 2015 20:18:00 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Sat, 03 Jan 2015 20:18:00 +0100
branch
TOR_BUG_3246
changeset 7
129ffea94266
permissions
-rw-r--r--

Conditionally enable double key logic according to:
private browsing mode or privacy.thirdparty.isolate preference and
implement in GetCookieStringCommon and FindCookie where it counts...
With some reservations of how to convince FindCookie users to test
condition and pass a nullptr when disabling double key logic.

     1 /***************************************************************************
     2  * Copyright (c) 2009,2010, Code Aurora Forum. All rights reserved.
     3  *
     4  * Use of this source code is governed by a BSD-style license that can be
     5  * found in the LICENSE file.
     6  ***************************************************************************/
     8 /***************************************************************************
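     9   Neon memset16: Attempts to fill a buffer with a 16-bit value using Neon registers if possible,
    10      Inputs:
    11         s: The buffer to write to (passed in r0)
    12         c: The integer data to write to the buffer (passed in r1; written as 16-bit items)
    13         n: The size_t count of 16-bit items to write (passed in r2), not a byte count.
    14      Outputs: r0 holds the original buffer pointer on return.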
    16 ***************************************************************************/
    18         .code 32
    19         .fpu neon
    20         .align 4
    21         .globl memset16_neon
    22         .func
    24 memset16_neon:
    25         cmp             r2, #0
    26         bxeq            lr
    28         /* Keep in mind that r2 -- the count argument -- is for the
    29          * number of 16-bit items to copy.
    30          */
    31         lsl             r2, r2, #1
    33         push            {r0}
    35         /* If we have < 8 bytes, just do a quick loop to handle that */
    36         cmp             r2, #8
    37         bgt             memset_gt4
    38 memset_smallcopy_loop:
    39         strh            r1, [r0], #2
    40         subs            r2, r2, #2
    41         bne             memset_smallcopy_loop
    42 memset_smallcopy_done:
    43         pop             {r0}
    44         bx              lr
    46 memset_gt4:
    47         /*
    48          * Duplicate the r1 lowest 16-bits across r1. The idea is to have
    49          * a register with two 16-bit-values we can copy. We do this by
    50          * duplicating lowest 16-bits of r1 to upper 16-bits.
    51          */
    52         orr             r1, r1, r1, lsl #16
    53         /*
    54          * If we're copying > 64 bytes, then we may want to get
    55          * onto a 16-byte boundary to improve speed even more.
    56          */
    57         cmp             r2, #64
    58         blt             memset_route
    59         ands            r12, r0, #0xf
    60         beq             memset_route
    61         /*
    62          * Determine the number of bytes to move forward to get to the 16-byte
    63          * boundary.  Note that this will be a multiple of 4, since we
    64          * already are word-aligned.
    65          */
    66         rsb             r12, r12, #16
    67         sub             r2, r2, r12
    68         lsls            r12, r12, #29
    69         strmi           r1, [r0], #4
    70         strcs           r1, [r0], #4
    71         strcs           r1, [r0], #4
    72         lsls            r12, r12, #2
    73         strcsh          r1, [r0], #2
    74 memset_route:
    75         /*
    76          * Decide where to route for the maximum copy sizes.  Note that we
    77          * build q0 and q1 depending on if we'll need it, so that's
    78          * interwoven here as well.
    79          */
    80         vdup.u32        d0, r1
    81         cmp             r2, #16
    82         blt             memset_8
    83         vmov            d1, d0
    84         cmp             r2, #64
    85         blt             memset_16
    86         vmov            q1, q0
    87         cmp             r2, #128
    88         blt             memset_32
    89 memset_128:
    90         mov             r12, r2, lsr #7
    91 memset_128_loop:
    92         vst1.64         {q0, q1}, [r0]!
    93         vst1.64         {q0, q1}, [r0]!
    94         vst1.64         {q0, q1}, [r0]!
    95         vst1.64         {q0, q1}, [r0]!
    96         subs            r12, r12, #1
    97         bne             memset_128_loop
    98         ands            r2, r2, #0x7f
    99         beq             memset_end
   100 memset_32:
   101         movs            r12, r2, lsr #5
   102         beq             memset_16
   103 memset_32_loop:
   104         subs            r12, r12, #1
   105         vst1.64         {q0, q1}, [r0]!
   106         bne             memset_32_loop
   107         ands            r2, r2, #0x1f
   108         beq             memset_end
   109 memset_16:
   110         movs            r12, r2, lsr #4
   111         beq             memset_8
   112 memset_16_loop:
   113         subs            r12, r12, #1
   114         vst1.32         {q0}, [r0]!
   115         bne             memset_16_loop
   116         ands            r2, r2, #0xf
   117         beq             memset_end
   118         /*
   119          * memset_8 isn't a loop, since we try to do our loops at 16
   120          * bytes and above.  We should loop there, then drop down here
   121          * to finish the <16-byte versions.  Same for memset_4 and
   122          * memset_1.
   123          */
   124 memset_8:
   125         cmp             r2, #8
   126         blt             memset_4
   127         subs            r2, r2, #8
   128         vst1.32         {d0}, [r0]!
   129 memset_4:
   130         cmp             r2, #4
   131         blt             memset_2
   132         subs            r2, r2, #4
   133         str             r1, [r0], #4
   134 memset_2:
   135         cmp             r2, #0
   136         ble             memset_end
   137         strh            r1, [r0], #2
   138 memset_end:
   139         pop             {r0}
   140         bx              lr
   142         .endfunc
   143         .end

mercurial