media/libvpx/vp8/common/arm/neon/variance_neon.asm

changeset 0:6474c204b198
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libvpx/vp8/common/arm/neon/variance_neon.asm	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,276 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance16x16_neon|
+    EXPORT  |vp8_variance16x8_neon|
+    EXPORT  |vp8_variance8x16_neon|
+    EXPORT  |vp8_variance8x8_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance16x16_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #8
+
+variance16x16_neon_loop
+    vld1.8          {q0}, [r0], r1              ;Load up source and reference
+    vld1.8          {q2}, [r2], r3
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    ;VPADAL adds adjacent pairs of elements of a vector, and accumulates
+    ;the results into the elements of the destination vector. The explanation
+    ;in the ARM guide is wrong.
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             variance16x16_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
+    ;vmov.32        r1, d1[0]
+    ;mul            r0, r0, r0
+    ;str            r1, [r12]
+    ;sub            r0, r1, r0, lsr #8
+
+    ; while sum is signed, sum * sum is always positive and must be treated as
+    ; unsigned to avoid propagating the sign bit.
+    vmull.s32       q5, d0, d0                  ;sum * sum
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.u32        d10, d10, #8                ;sum*sum >> 8 = sum*sum/256 (16x16 pixels)
+    vsub.u32        d0, d1, d10                 ;variance = sse - sum*sum/256
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+;================================
+;unsigned int vp8_variance16x8_c(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;    unsigned int *sse)
+|vp8_variance16x8_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #4
+
+variance16x8_neon_loop
+    vld1.8          {q0}, [r0], r1              ;Load up source and reference
+    vld1.8          {q2}, [r2], r3
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             variance16x8_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0                  ;sum * sum
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.u32        d10, d10, #7                ;sum*sum >> 7 = sum*sum/128 (16x8 pixels)
+    vsub.u32        d0, d1, d10                 ;variance = sse - sum*sum/128
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+;=================================
+;unsigned int vp8_variance8x16_c(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;    unsigned int *sse)
+
+|vp8_variance8x16_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #8
+
+variance8x16_neon_loop
+    vld1.8          {d0}, [r0], r1              ;Load up source and reference
+    vld1.8          {d4}, [r2], r3
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d6}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d2, d6
+
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+
+    bne             variance8x16_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0                  ;sum * sum
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.u32        d10, d10, #7                ;sum*sum >> 7 = sum*sum/128 (8x16 pixels)
+    vsub.u32        d0, d1, d10                 ;variance = sse - sum*sum/128
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+;==================================
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int  recon_stride
+; stack unsigned int *sse
+|vp8_variance8x8_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #2
+
+variance8x8_neon_loop
+    vld1.8          {d0}, [r0], r1              ;Load up source and reference
+    vld1.8          {d4}, [r2], r3
+    vld1.8          {d1}, [r0], r1
+    vld1.8          {d5}, [r2], r3
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d6}, [r2], r3
+    vld1.8          {d3}, [r0], r1
+    vld1.8          {d7}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             variance8x8_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0                  ;sum * sum
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.u32        d10, d10, #6                ;sum*sum >> 6 = sum*sum/64 (8x8 pixels)
+    vsub.u32        d0, d1, d10                 ;variance = sse - sum*sum/64
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+    END
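
For reference, all four routines in this file compute the same quantity: the sum of squared differences (sse) minus the squared sum of differences divided by the pixel count, which the assembly accumulates in q9/q10 (sse) and q8 (sum) and finishes with the vshr/vsub sequence. Below is a minimal C sketch of that computation under the prototypes given in the comments above; the function name variance_ref and the scalar loop are illustrative only, not the actual libvpx C implementation.

    #include <stdint.h>

    /* Reference sketch: variance of a w x h block = sse - sum*sum/(w*h). */
    static unsigned int variance_ref(const unsigned char *src_ptr, int source_stride,
                                     const unsigned char *ref_ptr, int recon_stride,
                                     int w, int h, unsigned int *sse)
    {
        int64_t  sum = 0;
        uint32_t sse_acc = 0;

        for (int i = 0; i < h; i++) {
            for (int j = 0; j < w; j++) {
                int diff = src_ptr[j] - ref_ptr[j];   /* vsubl.u8: widening diff   */
                sum     += diff;                      /* vpadal.s16 into q8 (sum)  */
                sse_acc += (uint32_t)(diff * diff);   /* vmlal.s16 into q9/q10     */
            }
            src_ptr += source_stride;
            ref_ptr += recon_stride;
        }

        *sse = sse_acc;
        /* sum*sum is non-negative, so dividing by w*h matches the unsigned right
         * shift by log2(w*h) in the assembly (#8, #7, #7, #6 above). */
        return sse_acc - (uint32_t)((uint64_t)(sum * sum) / (uint32_t)(w * h));
    }

A caller would pass the block dimensions of the routine it mirrors, e.g. variance_ref(src, src_stride, ref, ref_stride, 16, 16, &sse) for vp8_variance16x16_neon.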
