Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
12 EXPORT |vp8_mse16x16_neon|
13 EXPORT |vp8_get4x4sse_cs_neon|
15 ARM
16 REQUIRE8
17 PRESERVE8
19 AREA ||.text||, CODE, READONLY, ALIGN=2
;============================
; vp8_mse16x16_neon
;
; Accumulates the sum of squared differences between a 16x16 block of
; source pixels and a 16x16 block of reference pixels.
;
; In:
;   r0      unsigned char *src_ptr
;   r1      int            source_stride
;   r2      unsigned char *ref_ptr
;   r3      int            recon_stride
;   [sp]    unsigned int  *sse
; Out:
;   r0      low 32 bits of the accumulated sum of squares
;   *sse    the same 32-bit value
; Clobbers:
;   r0, r2 (advanced by their strides), r12, flags, q0-q3, q8-q15
;
; Note: the sum term that vp8_variance() computes is never used by this
; function, so that part of the calculation is omitted.
;
; The four accumulators live in q15, q8, q9 and q10. Only caller-saved
; NEON registers are used: d8-d15 (q4-q7) must be preserved across calls
; under AAPCS, so they are deliberately avoided.
;============================
|vp8_mse16x16_neon| PROC
    vmov.i8         q15, #0             ;q15, q8, q9, q10 - sse accumulators
    vmov.i8         q8, #0
    vmov.i8         q9, #0
    vmov.i8         q10, #0

    mov             r12, #8             ;8 iterations x 2 rows = 16 rows

mse16x16_neon_loop
    vld1.8          {q0}, [r0], r1      ;source row, then advance by stride
    vld1.8          {q2}, [r2], r3      ;reference row
    vld1.8          {q1}, [r0], r1      ;next source row
    vld1.8          {q3}, [r2], r3      ;next reference row

    vsubl.u8        q11, d0, d4         ;widen to 16 bit: src - ref
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vmlal.s16       q15, d22, d22       ;acc += diff * diff (32-bit lanes)
    vmlal.s16       q8, d23, d23

    subs            r12, r12, #1        ;NEON ops below leave the flags alone

    vmlal.s16       q9, d24, d24
    vmlal.s16       q10, d25, d25
    vmlal.s16       q15, d26, d26
    vmlal.s16       q8, d27, d27
    vmlal.s16       q9, d28, d28
    vmlal.s16       q10, d29, d29

    bne             mse16x16_neon_loop

    vadd.u32        q15, q15, q8        ;fold the four accumulators
    vadd.u32        q9, q9, q10

    ldr             r12, [sp]           ;load *sse from stack

    vadd.u32        q10, q15, q9
    vpaddl.u32      q1, q10             ;pairwise add into two 64-bit lanes
    vadd.u64        d0, d2, d3          ;final horizontal sum

    vst1.32         {d0[0]}, [r12]      ;*sse = low 32 bits of the total
    vmov.32         r0, d0[0]           ;return the same value

    bx              lr

    ENDP
;=============================
; vp8_get4x4sse_cs_neon
;
; Sum of squared differences between a 4x4 block of source pixels and a
; 4x4 block of reference pixels.
;
; In:
;   r0      unsigned char *src_ptr,
;   r1      int            source_stride,
;   r2      unsigned char *ref_ptr,
;   r3      int            recon_stride
; Out:
;   r0      low 32 bits of the sum of squared differences
; Clobbers:
;   r0, r2 (advanced by their strides), q0-q3, q8-q15
;
; Each row load fetches 8 bytes, but only the first 4 pixels of a row
; (the low halves d22/d24/d26/d28 of the widened differences) contribute
; to the result.
;
; Only caller-saved NEON registers are used: d8-d15 (q4-q7) must be
; preserved across calls under AAPCS, so they are deliberately avoided.
;=============================
|vp8_get4x4sse_cs_neon| PROC
    vld1.8          {d0}, [r0], r1      ;row 0: source, then reference
    vld1.8          {d4}, [r2], r3
    vld1.8          {d1}, [r0], r1      ;row 1
    vld1.8          {d5}, [r2], r3
    vld1.8          {d2}, [r0], r1      ;row 2
    vld1.8          {d6}, [r2], r3
    vld1.8          {d3}, [r0], r1      ;row 3
    vld1.8          {d7}, [r2], r3

    vsubl.u8        q11, d0, d4         ;widen to 16 bit: src - ref
    vsubl.u8        q12, d1, d5
    vsubl.u8        q13, d2, d6
    vsubl.u8        q14, d3, d7

    vmull.s16       q15, d22, d22       ;square the first 4 diffs of each row
    vmull.s16       q8, d24, d24
    vmull.s16       q9, d26, d26
    vmull.s16       q10, d28, d28

    vadd.u32        q15, q15, q8        ;fold the four rows together
    vadd.u32        q9, q9, q10
    vadd.u32        q9, q15, q9

    vpaddl.u32      q1, q9              ;pairwise add into two 64-bit lanes
    vadd.u64        d0, d2, d3          ;final horizontal sum

    vmov.32         r0, d0[0]           ;return low 32 bits of the total
    bx              lr

    ENDP
116 END