;
; Copyright (c) 2011 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;


12 EXPORT |vp8_mse16x16_armv6| |
|
13 |
|
14 ARM |
|
15 |
|
16 AREA ||.text||, CODE, READONLY, ALIGN=2 |
|
17 |
|
18 ; r0 unsigned char *src_ptr |
|
19 ; r1 int source_stride |
|
20 ; r2 unsigned char *ref_ptr |
|
21 ; r3 int recon_stride |
|
22 ; stack unsigned int *sse |
|
23 ; |
|
24 ;note: Based on vp8_variance16x16_armv6. In this function, sum is never used. |
|
25 ; So, we can remove this part of calculation. |
|
26 |
|
27 |vp8_mse16x16_armv6| PROC |
|
28 |
|
29 push {r4-r9, lr} |
|
30 |
|
31 pld [r0, r1, lsl #0] |
|
32 pld [r2, r3, lsl #0] |
|
33 |
|
34 mov r12, #16 ; set loop counter to 16 (=block height) |
|
35 mov r4, #0 ; initialize sse = 0 |
|
36 |
|
37 loop |
|
38 ; 1st 4 pixels |
|
39 ldr r5, [r0, #0x0] ; load 4 src pixels |
|
40 ldr r6, [r2, #0x0] ; load 4 ref pixels |
|
41 |
|
42 mov lr, #0 ; constant zero |
|
43 |
|
44 usub8 r8, r5, r6 ; calculate difference |
|
45 pld [r0, r1, lsl #1] |
|
46 sel r7, r8, lr ; select bytes with positive difference |
|
47 usub8 r9, r6, r5 ; calculate difference with reversed operands |
|
48 pld [r2, r3, lsl #1] |
|
49 sel r8, r9, lr ; select bytes with negative difference |
|
50 |
|
51 ; calculate partial sums |
|
52 usad8 r5, r7, lr ; calculate sum of positive differences |
|
53 usad8 r6, r8, lr ; calculate sum of negative differences |
|
54 orr r8, r8, r7 ; differences of all 4 pixels |
|
55 |
|
56 ldr r5, [r0, #0x4] ; load 4 src pixels |
|
57 |
|
58 ; calculate sse |
|
59 uxtb16 r6, r8 ; byte (two pixels) to halfwords |
|
60 uxtb16 r7, r8, ror #8 ; another two pixels to halfwords |
|
61 smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) |
|
62 |
|
63 ; 2nd 4 pixels |
|
64 ldr r6, [r2, #0x4] ; load 4 ref pixels |
|
65 smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) |
|
66 |
|
67 usub8 r8, r5, r6 ; calculate difference |
|
68 sel r7, r8, lr ; select bytes with positive difference |
|
69 usub8 r9, r6, r5 ; calculate difference with reversed operands |
|
70 sel r8, r9, lr ; select bytes with negative difference |
|
71 |
|
72 ; calculate partial sums |
|
73 usad8 r5, r7, lr ; calculate sum of positive differences |
|
74 usad8 r6, r8, lr ; calculate sum of negative differences |
|
75 orr r8, r8, r7 ; differences of all 4 pixels |
|
76 ldr r5, [r0, #0x8] ; load 4 src pixels |
|
77 ; calculate sse |
|
78 uxtb16 r6, r8 ; byte (two pixels) to halfwords |
|
79 uxtb16 r7, r8, ror #8 ; another two pixels to halfwords |
|
80 smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) |
|
81 |
|
82 ; 3rd 4 pixels |
|
83 ldr r6, [r2, #0x8] ; load 4 ref pixels |
|
84 smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) |
|
85 |
|
86 usub8 r8, r5, r6 ; calculate difference |
|
87 sel r7, r8, lr ; select bytes with positive difference |
|
88 usub8 r9, r6, r5 ; calculate difference with reversed operands |
|
89 sel r8, r9, lr ; select bytes with negative difference |
|
90 |
|
91 ; calculate partial sums |
|
92 usad8 r5, r7, lr ; calculate sum of positive differences |
|
93 usad8 r6, r8, lr ; calculate sum of negative differences |
|
94 orr r8, r8, r7 ; differences of all 4 pixels |
|
95 |
|
96 ldr r5, [r0, #0xc] ; load 4 src pixels |
|
97 |
|
98 ; calculate sse |
|
99 uxtb16 r6, r8 ; byte (two pixels) to halfwords |
|
100 uxtb16 r7, r8, ror #8 ; another two pixels to halfwords |
|
101 smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) |
|
102 |
|
103 ; 4th 4 pixels |
|
104 ldr r6, [r2, #0xc] ; load 4 ref pixels |
|
105 smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) |
|
106 |
|
107 usub8 r8, r5, r6 ; calculate difference |
|
108 add r0, r0, r1 ; set src_ptr to next row |
|
109 sel r7, r8, lr ; select bytes with positive difference |
|
110 usub8 r9, r6, r5 ; calculate difference with reversed operands |
|
111 add r2, r2, r3 ; set dst_ptr to next row |
|
112 sel r8, r9, lr ; select bytes with negative difference |
|
113 |
|
114 ; calculate partial sums |
|
115 usad8 r5, r7, lr ; calculate sum of positive differences |
|
116 usad8 r6, r8, lr ; calculate sum of negative differences |
|
117 orr r8, r8, r7 ; differences of all 4 pixels |
|
118 |
|
119 subs r12, r12, #1 ; next row |
|
120 |
|
121 ; calculate sse |
|
122 uxtb16 r6, r8 ; byte (two pixels) to halfwords |
|
123 uxtb16 r7, r8, ror #8 ; another two pixels to halfwords |
|
124 smlad r4, r6, r6, r4 ; dual signed multiply, add and accumulate (1) |
|
125 smlad r4, r7, r7, r4 ; dual signed multiply, add and accumulate (2) |
|
126 |
|
127 bne loop |
|
128 |
|
129 ; return stuff |
|
130 ldr r1, [sp, #28] ; get address of sse |
|
131 mov r0, r4 ; return sse |
|
132 str r4, [r1] ; store sse |
|
133 |
|
134 pop {r4-r9, pc} |
|
135 |
|
136 ENDP |
|
137 |
|
138 END |