media/libvpx/vp8/common/arm/neon/variance_neon.asm

Thu, 15 Jan 2015 15:59:08 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 15 Jan 2015 15:59:08 +0100
branch
TOR_BUG_9701
changeset 10
ac0c01689b40
permissions
-rw-r--r--

Implement a real Private Browsing Mode condition by changing the API/ABI.
This solves Tor bug #9701, complying with the disk avoidance requirements documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.

     1 ;
     2 ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
     3 ;
     4 ;  Use of this source code is governed by a BSD-style license
     5 ;  that can be found in the LICENSE file in the root of the source
     6 ;  tree. An additional intellectual property rights grant can be found
     7 ;  in the file PATENTS.  All contributing project authors may
     8 ;  be found in the AUTHORS file in the root of the source tree.
     9 ;
    12     EXPORT  |vp8_variance16x16_neon|            ;make the four variance entry points visible to the linker
    13     EXPORT  |vp8_variance16x8_neon|
    14     EXPORT  |vp8_variance8x16_neon|
    15     EXPORT  |vp8_variance8x8_neon|
    17     ARM                                         ;assemble the following code as ARM (A32) instructions
    18     REQUIRE8                                    ;this code requires an 8-byte aligned stack
    19     PRESERVE8                                   ;this code keeps the stack 8-byte aligned
    21     AREA ||.text||, CODE, READONLY, ALIGN=2     ;read-only code section, aligned to 2^2 = 4 bytes
    23 ; In:  r0 = unsigned char *src_ptr,  r1 = int source_stride
    24 ;      r2 = unsigned char *ref_ptr,  r3 = int recon_stride
    25 ;      [sp] = unsigned int *sse (fifth argument, on the stack)
    26 ; Out: r0 = sse - ((sum * sum) >> 8) for the 16x16 block; *sse = sse
    27 ; Clobbers r0-r3, r12, flags, q0-q3, q8-q14; callee-saved q4-q7 are untouched
    28 |vp8_variance16x16_neon| PROC
    29     vmov.i8         q8, #0                      ;q8 - sum of (src - ref)
    30     vmov.i8         q9, #0                      ;q9, q10 - sse, sum of (src - ref)^2
    31     vmov.i8         q10, #0
    33     mov             r12, #8                     ;8 passes, 2 rows of 16 pixels each
    35 variance16x16_neon_loop
    36     vld1.8          {q0}, [r0], r1              ;Load up source and reference
    37     vld1.8          {q2}, [r2], r3
    38     vld1.8          {q1}, [r0], r1
    39     vld1.8          {q3}, [r2], r3
    41     vsubl.u8        q11, d0, d4                 ;calculate diff (widened to 16 bits)
    42     vsubl.u8        q12, d1, d5
    43     vsubl.u8        q13, d2, d6
    44     vsubl.u8        q14, d3, d7
    46     ;VPADAL adds adjacent pairs of elements of a vector, and accumulates
    47     ;the results into the elements of the destination vector. The explanation
    48     ;in ARM guide is wrong.
    49     vpadal.s16      q8, q11                     ;calculate sum
    50     vmlal.s16       q9, d22, d22                ;calculate sse
    51     vmlal.s16       q10, d23, d23
    53     subs            r12, r12, #1                ;flags are consumed by the bne below
    55     vpadal.s16      q8, q12
    56     vmlal.s16       q9, d24, d24
    57     vmlal.s16       q10, d25, d25
    58     vpadal.s16      q8, q13
    59     vmlal.s16       q9, d26, d26
    60     vmlal.s16       q10, d27, d27
    61     vpadal.s16      q8, q14
    62     vmlal.s16       q9, d28, d28
    63     vmlal.s16       q10, d29, d29
    65     bne             variance16x16_neon_loop
    67     vadd.u32        q10, q9, q10                ;accumulate sse
    68     vpaddl.s32      q0, q8                      ;accumulate sum
    70     ldr             r12, [sp]                   ;load *sse from stack
    72     vpaddl.u32      q1, q10
    73     vadd.s64        d0, d0, d1                  ;d0 = total sum
    74     vadd.u64        d1, d2, d3                  ;d1 = total sse
    76     ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
    77     ;vmov.32        r1, d1[0]
    78     ;mul            r0, r0, r0
    79     ;str            r1, [r12]
    80     ;sub            r0, r1, r0, lsr #8
    82     ; while sum is signed, sum * sum is always positive and must be treated as
    83     ; unsigned to avoid propagating the sign bit.
    84     vmull.s32       q2, d0, d0                  ;sum * sum; q2 is caller-saved scratch (q4-q7 must be preserved)
    85     vst1.32         {d1[0]}, [r12]              ;store sse
    86     vshr.u32        d4, d4, #8                  ;(sum * sum) / 256 pixels
    87     vsub.u32        d0, d1, d4                  ;variance = sse - (sum * sum) / 256
    89     vmov.32         r0, d0[0]                   ;return
    90     bx              lr
    92     ENDP
    94 ;================================
    95 ;unsigned int vp8_variance16x8_neon(
    96 ;    unsigned char *src_ptr,        r0
    97 ;    int  source_stride,            r1
    98 ;    unsigned char *ref_ptr,        r2
    99 ;    int  recon_stride,             r3
   100 ;    unsigned int *sse)             [sp]; returns sse - ((sum * sum) >> 7)
   101 |vp8_variance16x8_neon| PROC
   102     vmov.i8         q8, #0                      ;q8 - sum of (src - ref)
   103     vmov.i8         q9, #0                      ;q9, q10 - sse, sum of (src - ref)^2
   104     vmov.i8         q10, #0
   106     mov             r12, #4                     ;4 passes, 2 rows of 16 pixels each
   108 variance16x8_neon_loop
   109     vld1.8          {q0}, [r0], r1              ;Load up source and reference
   110     vld1.8          {q2}, [r2], r3
   111     vld1.8          {q1}, [r0], r1
   112     vld1.8          {q3}, [r2], r3
   114     vsubl.u8        q11, d0, d4                 ;calculate diff (widened to 16 bits)
   115     vsubl.u8        q12, d1, d5
   116     vsubl.u8        q13, d2, d6
   117     vsubl.u8        q14, d3, d7
   119     vpadal.s16      q8, q11                     ;calculate sum
   120     vmlal.s16       q9, d22, d22                ;calculate sse
   121     vmlal.s16       q10, d23, d23
   123     subs            r12, r12, #1                ;flags are consumed by the bne below
   125     vpadal.s16      q8, q12
   126     vmlal.s16       q9, d24, d24
   127     vmlal.s16       q10, d25, d25
   128     vpadal.s16      q8, q13
   129     vmlal.s16       q9, d26, d26
   130     vmlal.s16       q10, d27, d27
   131     vpadal.s16      q8, q14
   132     vmlal.s16       q9, d28, d28
   133     vmlal.s16       q10, d29, d29
   135     bne             variance16x8_neon_loop
   137     vadd.u32        q10, q9, q10                ;accumulate sse
   138     vpaddl.s32      q0, q8                      ;accumulate sum
   140     ldr             r12, [sp]                   ;load *sse from stack
   142     vpaddl.u32      q1, q10
   143     vadd.s64        d0, d0, d1                  ;d0 = total sum
   144     vadd.u64        d1, d2, d3                  ;d1 = total sse
   146     vmull.s32       q2, d0, d0                  ;sum * sum; q2 is caller-saved scratch (q4-q7 must be preserved)
   147     vst1.32         {d1[0]}, [r12]              ;store sse
   148     vshr.u32        d4, d4, #7                  ;(sum * sum) / 128 pixels, shifted as unsigned
   149     vsub.u32        d0, d1, d4                  ;variance = sse - (sum * sum) / 128
   151     vmov.32         r0, d0[0]                   ;return
   152     bx              lr
   154     ENDP
   156 ;=================================
   157 ;unsigned int vp8_variance8x16_neon(
   158 ;    unsigned char *src_ptr,        r0
   159 ;    int  source_stride,            r1
   160 ;    unsigned char *ref_ptr,        r2
   161 ;    int  recon_stride,             r3
   162 ;    unsigned int *sse)             [sp]; returns sse - ((sum * sum) >> 7)
   164 |vp8_variance8x16_neon| PROC
   165     vmov.i8         q8, #0                      ;q8 - sum of (src - ref)
   166     vmov.i8         q9, #0                      ;q9, q10 - sse, sum of (src - ref)^2
   167     vmov.i8         q10, #0
   169     mov             r12, #8                     ;8 passes, 2 rows of 8 pixels each
   171 variance8x16_neon_loop
   172     vld1.8          {d0}, [r0], r1              ;Load up source and reference
   173     vld1.8          {d4}, [r2], r3
   174     vld1.8          {d2}, [r0], r1
   175     vld1.8          {d6}, [r2], r3
   177     vsubl.u8        q11, d0, d4                 ;calculate diff (widened to 16 bits)
   178     vsubl.u8        q12, d2, d6
   180     vpadal.s16      q8, q11                     ;calculate sum
   181     vmlal.s16       q9, d22, d22                ;calculate sse
   182     vmlal.s16       q10, d23, d23
   184     subs            r12, r12, #1                ;flags are consumed by the bne below
   186     vpadal.s16      q8, q12
   187     vmlal.s16       q9, d24, d24
   188     vmlal.s16       q10, d25, d25
   190     bne             variance8x16_neon_loop
   192     vadd.u32        q10, q9, q10                ;accumulate sse
   193     vpaddl.s32      q0, q8                      ;accumulate sum
   195     ldr             r12, [sp]                   ;load *sse from stack
   197     vpaddl.u32      q1, q10
   198     vadd.s64        d0, d0, d1                  ;d0 = total sum
   199     vadd.u64        d1, d2, d3                  ;d1 = total sse
   201     vmull.s32       q2, d0, d0                  ;sum * sum; q2 is caller-saved scratch (q4-q7 must be preserved)
   202     vst1.32         {d1[0]}, [r12]              ;store sse
   203     vshr.u32        d4, d4, #7                  ;(sum * sum) / 128 pixels, shifted as unsigned
   204     vsub.u32        d0, d1, d4                  ;variance = sse - (sum * sum) / 128
   206     vmov.32         r0, d0[0]                   ;return
   207     bx              lr
   209     ENDP
   211 ;==================================
   212 ; In:  r0 = unsigned char *src_ptr,  r1 = int source_stride
   213 ;      r2 = unsigned char *ref_ptr,  r3 = int recon_stride
   214 ;      [sp] = unsigned int *sse (fifth argument, on the stack)
   215 ; Out: r0 = sse - ((sum * sum) >> 6) for the 8x8 block; *sse = sse
   216 ; Clobbers r0-r3, r12, flags, q0-q3, q8-q14; callee-saved q4-q7 are untouched
   217 |vp8_variance8x8_neon| PROC
   218     vmov.i8         q8, #0                      ;q8 - sum of (src - ref)
   219     vmov.i8         q9, #0                      ;q9, q10 - sse, sum of (src - ref)^2
   220     vmov.i8         q10, #0
   222     mov             r12, #2                     ;2 passes, 4 rows of 8 pixels each
   224 variance8x8_neon_loop
   225     vld1.8          {d0}, [r0], r1              ;Load up source and reference
   226     vld1.8          {d4}, [r2], r3
   227     vld1.8          {d1}, [r0], r1
   228     vld1.8          {d5}, [r2], r3
   229     vld1.8          {d2}, [r0], r1
   230     vld1.8          {d6}, [r2], r3
   231     vld1.8          {d3}, [r0], r1
   232     vld1.8          {d7}, [r2], r3
   234     vsubl.u8        q11, d0, d4                 ;calculate diff (widened to 16 bits)
   235     vsubl.u8        q12, d1, d5
   236     vsubl.u8        q13, d2, d6
   237     vsubl.u8        q14, d3, d7
   239     vpadal.s16      q8, q11                     ;calculate sum
   240     vmlal.s16       q9, d22, d22                ;calculate sse
   241     vmlal.s16       q10, d23, d23
   243     subs            r12, r12, #1                ;flags are consumed by the bne below
   245     vpadal.s16      q8, q12
   246     vmlal.s16       q9, d24, d24
   247     vmlal.s16       q10, d25, d25
   248     vpadal.s16      q8, q13
   249     vmlal.s16       q9, d26, d26
   250     vmlal.s16       q10, d27, d27
   251     vpadal.s16      q8, q14
   252     vmlal.s16       q9, d28, d28
   253     vmlal.s16       q10, d29, d29
   255     bne             variance8x8_neon_loop
   257     vadd.u32        q10, q9, q10                ;accumulate sse
   258     vpaddl.s32      q0, q8                      ;accumulate sum
   260     ldr             r12, [sp]                   ;load *sse from stack
   262     vpaddl.u32      q1, q10
   263     vadd.s64        d0, d0, d1                  ;d0 = total sum
   264     vadd.u64        d1, d2, d3                  ;d1 = total sse
   266     vmull.s32       q2, d0, d0                  ;sum * sum; q2 is caller-saved scratch (q4-q7 must be preserved)
   267     vst1.32         {d1[0]}, [r12]              ;store sse
   268     vshr.u32        d4, d4, #6                  ;(sum * sum) / 64 pixels, shifted as unsigned
   269     vsub.u32        d0, d1, d4                  ;variance = sse - (sum * sum) / 64
   271     vmov.32         r0, d0[0]                   ;return
   272     bx              lr
   274     ENDP
   276     END                                         ;end of the assembly source; nothing after this is assembled

mercurial