--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libvpx/vp8/common/arm/neon/variance_neon.asm	Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,276 @@
+;
+;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+;  Use of this source code is governed by a BSD-style license
+;  that can be found in the LICENSE file in the root of the source
+;  tree. An additional intellectual property rights grant can be found
+;  in the file PATENTS.  All contributing project authors may
+;  be found in the AUTHORS file in the root of the source tree.
+;
+
+
+    EXPORT  |vp8_variance16x16_neon|
+    EXPORT  |vp8_variance16x8_neon|
+    EXPORT  |vp8_variance8x16_neon|
+    EXPORT  |vp8_variance8x8_neon|
+
+    ARM
+    REQUIRE8
+    PRESERVE8
+
+    AREA ||.text||, CODE, READONLY, ALIGN=2
+
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int recon_stride
+; stack unsigned int *sse
+|vp8_variance16x16_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #8
+
+variance16x16_neon_loop
+    vld1.8          {q0}, [r0], r1              ;Load up source and reference
+    vld1.8          {q2}, [r2], r3
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    ;VPADAL adds adjacent pairs of elements of a vector, and accumulates
+    ;the results into the elements of the destination vector. The explanation
+    ;in ARM guide is wrong.
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             variance16x16_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    ;vmov.32        r0, d0[0]                   ;this instruction costs a lot
+    ;vmov.32        r1, d1[0]
+    ;mul            r0, r0, r0
+    ;str            r1, [r12]
+    ;sub            r0, r1, r0, lsr #8
+
+    ; while sum is signed, sum * sum is always positive and must be treated as
+    ; unsigned to avoid propagating the sign bit.
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.u32        d10, d10, #8
+    vsub.u32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+;================================
+;unsigned int vp8_variance16x8_c(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;    unsigned int *sse)
+|vp8_variance16x8_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #4
+
+variance16x8_neon_loop
+    vld1.8          {q0}, [r0], r1              ;Load up source and reference
+    vld1.8          {q2}, [r2], r3
+    vld1.8          {q1}, [r0], r1
+    vld1.8          {q3}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             variance16x8_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.u32        d10, d10, #7
+    vsub.u32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+;=================================
+;unsigned int vp8_variance8x16_c(
+;    unsigned char *src_ptr,
+;    int  source_stride,
+;    unsigned char *ref_ptr,
+;    int  recon_stride,
+;    unsigned int *sse)
+
+|vp8_variance8x16_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #8
+
+variance8x16_neon_loop
+    vld1.8          {d0}, [r0], r1              ;Load up source and reference
+    vld1.8          {d4}, [r2], r3
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d6}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d2, d6
+
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+
+    bne             variance8x16_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.u32        d10, d10, #7
+    vsub.u32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+;==================================
+; r0    unsigned char *src_ptr
+; r1    int source_stride
+; r2    unsigned char *ref_ptr
+; r3    int recon_stride
+; stack unsigned int *sse
+|vp8_variance8x8_neon| PROC
+    vmov.i8         q8, #0                      ;q8 - sum
+    vmov.i8         q9, #0                      ;q9, q10 - sse
+    vmov.i8         q10, #0
+
+    mov             r12, #2
+
+variance8x8_neon_loop
+    vld1.8          {d0}, [r0], r1              ;Load up source and reference
+    vld1.8          {d4}, [r2], r3
+    vld1.8          {d1}, [r0], r1
+    vld1.8          {d5}, [r2], r3
+    vld1.8          {d2}, [r0], r1
+    vld1.8          {d6}, [r2], r3
+    vld1.8          {d3}, [r0], r1
+    vld1.8          {d7}, [r2], r3
+
+    vsubl.u8        q11, d0, d4                 ;calculate diff
+    vsubl.u8        q12, d1, d5
+    vsubl.u8        q13, d2, d6
+    vsubl.u8        q14, d3, d7
+
+    vpadal.s16      q8, q11                     ;calculate sum
+    vmlal.s16       q9, d22, d22                ;calculate sse
+    vmlal.s16       q10, d23, d23
+
+    subs            r12, r12, #1
+
+    vpadal.s16      q8, q12
+    vmlal.s16       q9, d24, d24
+    vmlal.s16       q10, d25, d25
+    vpadal.s16      q8, q13
+    vmlal.s16       q9, d26, d26
+    vmlal.s16       q10, d27, d27
+    vpadal.s16      q8, q14
+    vmlal.s16       q9, d28, d28
+    vmlal.s16       q10, d29, d29
+
+    bne             variance8x8_neon_loop
+
+    vadd.u32        q10, q9, q10                ;accumulate sse
+    vpaddl.s32      q0, q8                      ;accumulate sum
+
+    ldr             r12, [sp]                   ;load *sse from stack
+
+    vpaddl.u32      q1, q10
+    vadd.s64        d0, d0, d1
+    vadd.u64        d1, d2, d3
+
+    vmull.s32       q5, d0, d0
+    vst1.32         {d1[0]}, [r12]              ;store sse
+    vshr.u32        d10, d10, #6
+    vsub.u32        d0, d1, d10
+
+    vmov.32         r0, d0[0]                   ;return
+    bx              lr
+
+    ENDP
+
+    END
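For reference, each routine in the diff above computes the same quantity as the scalar C functions named in its comments (e.g. vp8_variance16x8_c): it accumulates the signed sum of differences and the sum of squared differences over the block, stores the latter through *sse, and returns sse - sum*sum/(w*h). The right shifts #8, #7 and #6 in the epilogues are log2(w*h) for the 16x16, 16x8/8x16 and 8x8 block sizes. Below is a minimal scalar sketch of that computation; the function name variance_ref and the w/h parameters are illustrative, not the exact libvpx C source.

    #include <stdint.h>

    /* Scalar sketch of the variance computed by the NEON routines above.
     * For a w x h block: *sse is the sum of squared differences and the
     * return value is *sse - sum*sum / (w*h).  Names are illustrative. */
    static unsigned int variance_ref(const unsigned char *src_ptr, int source_stride,
                                     const unsigned char *ref_ptr, int recon_stride,
                                     int w, int h, unsigned int *sse)
    {
        int64_t  sum = 0;
        uint32_t sse_acc = 0;
        int      shift = 0;

        for (int n = w * h; n > 1; n >>= 1)
            shift++;                          /* shift = log2(w*h): 8, 7 or 6 */

        for (int i = 0; i < h; i++) {
            for (int j = 0; j < w; j++) {
                int diff = src_ptr[j] - ref_ptr[j];
                sum += diff;
                sse_acc += (uint32_t)(diff * diff);
            }
            src_ptr += source_stride;
            ref_ptr += recon_stride;
        }

        *sse = sse_acc;
        /* sum may be negative but sum*sum is not; keep it unsigned before the
         * shift, as the comment in vp8_variance16x16_neon points out. */
        return sse_acc - (uint32_t)((uint64_t)(sum * sum) >> shift);
    }

The NEON versions simply vectorize these two accumulations (vpadal.s16 for the sum, vmlal.s16 for the sum of squares) and fold the final reduction into a handful of pairwise adds before the subtract.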