1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libvpx/vp8/encoder/arm/neon/vp8_mse16x16_neon.asm Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,116 @@ 1.4 +; 1.5 +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 1.6 +; 1.7 +; Use of this source code is governed by a BSD-style license 1.8 +; that can be found in the LICENSE file in the root of the source 1.9 +; tree. An additional intellectual property rights grant can be found 1.10 +; in the file PATENTS. All contributing project authors may 1.11 +; be found in the AUTHORS file in the root of the source tree. 1.12 +; 1.13 + 1.14 + 1.15 + EXPORT |vp8_mse16x16_neon| 1.16 + EXPORT |vp8_get4x4sse_cs_neon| 1.17 + 1.18 + ARM 1.19 + REQUIRE8 1.20 + PRESERVE8 1.21 + 1.22 + AREA ||.text||, CODE, READONLY, ALIGN=2 1.23 +;============================ 1.24 +; r0 unsigned char *src_ptr 1.25 +; r1 int source_stride 1.26 +; r2 unsigned char *ref_ptr 1.27 +; r3 int recon_stride 1.28 +; stack unsigned int *sse 1.29 +;note: in this function, sum is never used. So, we can remove this part of calculation 1.30 +;from vp8_variance(). 

;-----------------------------------------------------------------------
; unsigned int vp8_mse16x16_neon(const unsigned char *src_ptr,
;                                int source_stride,
;                                const unsigned char *ref_ptr,
;                                int recon_stride,
;                                unsigned int *sse)
;
; Sum of squared differences over a 16x16 block of bytes.
;
; In:    r0      = src_ptr
;        r1      = source_stride (bytes between consecutive src rows)
;        r2      = ref_ptr
;        r3      = recon_stride  (bytes between consecutive ref rows)
;        [sp]    = sse (pointer that receives the result)
; Out:   r0      = sum of squared differences; the same value is also
;                  stored to *sse
; Clobb: r0, r2, r12, flags, q0-q3, q8-q15
;
; Only caller-saved NEON registers are used. d8-d15 (q4-q7) must be
; preserved across calls under the AAPCS, so the accumulators live in
; q15/q8/q9/q10 rather than in q7.
;-----------------------------------------------------------------------
|vp8_mse16x16_neon| PROC
    vmov.i8         q15, #0                 ;q15, q8, q9, q10 - sse accumulators
    vmov.i8         q8, #0
    vmov.i8         q9, #0
    vmov.i8         q10, #0

    mov             r12, #8                 ;8 iterations x 2 rows = 16 rows

mse16x16_neon_loop
    vld1.8          {q0}, [r0], r1          ;src row n
    vld1.8          {q2}, [r2], r3          ;ref row n
    vld1.8          {q1}, [r0], r1          ;src row n+1
    vld1.8          {q3}, [r2], r3          ;ref row n+1

    ;Widen to 16 bits while subtracting. The result wraps modulo 2^16,
    ;which read as a signed 16-bit lane is the true difference
    ;(range -255..255), hence the signed multiplies below.
    vsubl.u8        q11, d0, d4             ;row n,   columns 0-7
    vsubl.u8        q12, d1, d5             ;row n,   columns 8-15
    vsubl.u8        q13, d2, d6             ;row n+1, columns 0-7
    vsubl.u8        q14, d3, d7             ;row n+1, columns 8-15

    vmlal.s16       q15, d22, d22           ;acc += diff * diff
    vmlal.s16       q8, d23, d23

    subs            r12, r12, #1            ;NEON ops below leave the flags alone

    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vmlal.s16       q15, d26, d26
    vmlal.s16       q8, d27, d27
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             mse16x16_neon_loop

    ;Fold the four accumulators into one vector of four 32-bit partials.
    vadd.u32        q15, q15, q8
    vadd.u32        q9, q9, q10

    ldr             r12, [sp]               ;load *sse from stack

    vadd.u32        q10, q15, q9
    vpaddl.u32      q1, q10                 ;pairwise add -> two 64-bit sums
    vadd.u64        d0, d2, d3              ;total in d0

    vst1.32         {d0[0]}, [r12]          ;*sse = low 32 bits of the total
    vmov.32         r0, d0[0]               ;return the same value

    bx              lr

    ENDP


;-----------------------------------------------------------------------
; unsigned int vp8_get4x4sse_cs_neon(const unsigned char *src_ptr,
;                                    int source_stride,
;                                    const unsigned char *ref_ptr,
;                                    int recon_stride)
;
; Sum of squared differences over a 4x4 block of bytes.
;
; In:    r0      = src_ptr
;        r1      = source_stride (bytes between consecutive src rows)
;        r2      = ref_ptr
;        r3      = recon_stride  (bytes between consecutive ref rows)
; Out:   r0      = sum of squared differences
; Clobb: r0, r2, q0-q3, q8-q15
;
; Each row is fetched with an 8-byte load, so 8 bytes must be readable
; at the start of every row; only the first 4 bytes of each row take
; part in the sum.
;
; Only caller-saved NEON registers are used. d8-d15 (q4-q7) must be
; preserved across calls under the AAPCS, so q15 is used where q7 would
; otherwise be clobbered.
;-----------------------------------------------------------------------
|vp8_get4x4sse_cs_neon| PROC
    vld1.8          {d0}, [r0], r1          ;Load up source and reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d1}, [r0], r1
    vld1.8          {d5}, [r2], r3
    vld1.8          {d2}, [r0], r1
    vld1.8          {d6}, [r2], r3
    vld1.8          {d3}, [r0], r1
    vld1.8          {d7}, [r2], r3

    vsubl.u8        q11, d0, d4             ;row 0 differences, 16-bit lanes
    vsubl.u8        q12, d1, d5             ;row 1
    vsubl.u8        q13, d2, d6             ;row 2
    vsubl.u8        q14, d3, d7             ;row 3

    ;Square only the low half of each difference vector: those four
    ;lanes are columns 0-3, the columns that belong to the 4x4 block.
    vmull.s16       q15, d22, d22
    vmull.s16       q8, d24, d24
    vmull.s16       q9, d26, d26
    vmull.s16       q10, d28, d28

    vadd.u32        q15, q15, q8
    vadd.u32        q9, q9, q10
    vadd.u32        q9, q15, q9

    vpaddl.u32      q1, q9                  ;pairwise add -> two 64-bit sums
    vadd.u64        d0, d2, d3              ;total in d0

    vmov.32         r0, d0[0]               ;return low 32 bits of the total
    bx              lr

    ENDP

    END