;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION .text

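; SAD_FN width, height, num_gprs, avg
; Common prologue shared by all SAD kernels below.  %1/%2 are the block width
; and height baked into the function name, %3 is the number of general-purpose
; registers requested from cglobal (5, or 7 when stride*3 offsets are needed),
; and %4 selects the plain (0) or second-predictor averaging (non-zero)
; variant.  The macro sign-extends both strides and, in the 7-register case,
; precomputes src_stride*3 and ref_stride*3 so four rows can be addressed per
; loop iteration.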
%macro SAD_FN 4
%if %4 == 0
%if %3 == 5
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, n_rows
%else ; %3 == 7
cglobal sad%1x%2, 4, %3, 5, src, src_stride, ref, ref_stride, \
                            src_stride3, ref_stride3, n_rows
%endif ; %3 == 5/7
%else ; avg
%if %3 == 5
cglobal sad%1x%2_avg, 5, 1 + %3, 5, src, src_stride, ref, ref_stride, \
                                    second_pred, n_rows
%else ; %3 == 7
cglobal sad%1x%2_avg, 5, ARCH_X86_64 + %3, 5, src, src_stride, \
                                              ref, ref_stride, \
                                              second_pred, \
                                              src_stride3, ref_stride3
%if ARCH_X86_64
%define n_rowsd r7d
%else ; x86-32
%define n_rowsd dword r0m
%endif ; x86-32/64
%endif ; %3 == 5/7
%endif ; avg/sad
  movsxdifnidn  src_strideq, src_strided
  movsxdifnidn  ref_strideq, ref_strided
%if %3 == 7
  lea           src_stride3q, [src_strideq*3]
  lea           ref_stride3q, [ref_strideq*3]
%endif ; %3 == 7
%endmacro

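; For reference, every kernel below computes the same quantity as the
; following scalar C model (purely illustrative; the name sad_ref_c and its
; signature are not part of libvpx).  The *_avg variants first average ref
; against a contiguous width*height second predictor with pavgb rounding:
;
;   #include <stdint.h>
;
;   static unsigned int sad_ref_c(const uint8_t *src, int src_stride,
;                                 const uint8_t *ref, int ref_stride,
;                                 const uint8_t *second_pred, /* NULL: plain SAD */
;                                 int width, int height) {
;     unsigned int sad = 0;
;     for (int row = 0; row < height; ++row) {
;       for (int col = 0; col < width; ++col) {
;         int r = ref[col];
;         if (second_pred)
;           r = (r + second_pred[col] + 1) >> 1;   /* pavgb: round-half-up avg */
;         int d = src[col] - r;
;         sad += d < 0 ? -d : d;                   /* absolute difference */
;       }
;       src += src_stride;
;       ref += ref_stride;
;       if (second_pred)
;         second_pred += width;                    /* second_pred is contiguous */
;     }
;     return sad;
;   }
;
; Note that psadbw on an xmm register produces two partial sums (one per
; 64-bit half), which is why the xmm kernels fold m0 with movhlps/paddd
; before the final movd eax, m0.
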
; unsigned int vp9_sad64x{32,64}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
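; SAD64XN processes one 64-pixel row per loop iteration: four unaligned
; 16-byte loads from ref, then psadbw with srcq used directly as a memory
; operand (which requires 16-byte-aligned src), so n_rows is the block height.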
%macro SAD64XN 1-2 0
  SAD_FN 64, %1, 5, %2
  mov           n_rowsd, %1
  pxor          m0, m0
.loop:
  movu          m1, [refq]
  movu          m2, [refq+16]
  movu          m3, [refq+32]
  movu          m4, [refq+48]
%if %2 == 1
  pavgb         m1, [second_predq+mmsize*0]
  pavgb         m2, [second_predq+mmsize*1]
  pavgb         m3, [second_predq+mmsize*2]
  pavgb         m4, [second_predq+mmsize*3]
  lea           second_predq, [second_predq+mmsize*4]
%endif
  psadbw        m1, [srcq]
  psadbw        m2, [srcq+16]
  psadbw        m3, [srcq+32]
  psadbw        m4, [srcq+48]
  paddd         m1, m2
  paddd         m3, m4
  add           refq, ref_strideq
  paddd         m0, m1
  add           srcq, src_strideq
  paddd         m0, m3
  dec           n_rowsd
  jg .loop

  movhlps       m1, m0
  paddd         m0, m1
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD64XN 64 ; sad64x64_sse2
SAD64XN 32 ; sad64x32_sse2
SAD64XN 64, 1 ; sad64x64_avg_sse2
SAD64XN 32, 1 ; sad64x32_avg_sse2

; unsigned int vp9_sad32x{16,32,64}_sse2(uint8_t *src, int src_stride,
;                                        uint8_t *ref, int ref_stride);
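; SAD32XN covers two rows per loop iteration (n_rows = height/2), issuing two
; 16-byte loads per row and stepping both pointers by stride*2.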
%macro SAD32XN 1-2 0
  SAD_FN 32, %1, 5, %2
  mov           n_rowsd, %1/2
  pxor          m0, m0
.loop:
  movu          m1, [refq]
  movu          m2, [refq+16]
  movu          m3, [refq+ref_strideq]
  movu          m4, [refq+ref_strideq+16]
%if %2 == 1
  pavgb         m1, [second_predq+mmsize*0]
  pavgb         m2, [second_predq+mmsize*1]
  pavgb         m3, [second_predq+mmsize*2]
  pavgb         m4, [second_predq+mmsize*3]
  lea           second_predq, [second_predq+mmsize*4]
%endif
  psadbw        m1, [srcq]
  psadbw        m2, [srcq+16]
  psadbw        m3, [srcq+src_strideq]
  psadbw        m4, [srcq+src_strideq+16]
  paddd         m1, m2
  paddd         m3, m4
  lea           refq, [refq+ref_strideq*2]
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*2]
  paddd         m0, m3
  dec           n_rowsd
  jg .loop

  movhlps       m1, m0
  paddd         m0, m1
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD32XN 64 ; sad32x64_sse2
SAD32XN 32 ; sad32x32_sse2
SAD32XN 16 ; sad32x16_sse2
SAD32XN 64, 1 ; sad32x64_avg_sse2
SAD32XN 32, 1 ; sad32x32_avg_sse2
SAD32XN 16, 1 ; sad32x16_avg_sse2

; unsigned int vp9_sad16x{8,16,32}_sse2(uint8_t *src, int src_stride,
;                                       uint8_t *ref, int ref_stride);
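; SAD16XN handles four rows per loop iteration (n_rows = height/4), using the
; ref_stride3q/src_stride3q values precomputed by SAD_FN's 7-register form to
; reach the row at offset 3*stride.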
%macro SAD16XN 1-2 0
  SAD_FN 16, %1, 7, %2
  mov           n_rowsd, %1/4
  pxor          m0, m0

.loop:
  movu          m1, [refq]
  movu          m2, [refq+ref_strideq]
  movu          m3, [refq+ref_strideq*2]
  movu          m4, [refq+ref_stride3q]
%if %2 == 1
  pavgb         m1, [second_predq+mmsize*0]
  pavgb         m2, [second_predq+mmsize*1]
  pavgb         m3, [second_predq+mmsize*2]
  pavgb         m4, [second_predq+mmsize*3]
  lea           second_predq, [second_predq+mmsize*4]
%endif
  psadbw        m1, [srcq]
  psadbw        m2, [srcq+src_strideq]
  psadbw        m3, [srcq+src_strideq*2]
  psadbw        m4, [srcq+src_stride3q]
  paddd         m1, m2
  paddd         m3, m4
  lea           refq, [refq+ref_strideq*4]
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*4]
  paddd         m0, m3
  dec           n_rowsd
  jg .loop

  movhlps       m1, m0
  paddd         m0, m1
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD16XN 32 ; sad16x32_sse2
SAD16XN 16 ; sad16x16_sse2
SAD16XN 8 ; sad16x8_sse2
SAD16XN 32, 1 ; sad16x32_avg_sse2
SAD16XN 16, 1 ; sad16x16_avg_sse2
SAD16XN 8, 1 ; sad16x8_avg_sse2

; unsigned int vp9_sad8x{4,8,16}_sse2(uint8_t *src, int src_stride,
;                                     uint8_t *ref, int ref_stride);
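; SAD8XN packs two 8-byte rows into one xmm register with movh/movhps, so a
; single psadbw covers two rows and each loop iteration consumes four.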
%macro SAD8XN 1-2 0
  SAD_FN 8, %1, 7, %2
  mov           n_rowsd, %1/4
  pxor          m0, m0

.loop:
  movh          m1, [refq]
  movhps        m1, [refq+ref_strideq]
  movh          m2, [refq+ref_strideq*2]
  movhps        m2, [refq+ref_stride3q]
%if %2 == 1
  pavgb         m1, [second_predq+mmsize*0]
  pavgb         m2, [second_predq+mmsize*1]
  lea           second_predq, [second_predq+mmsize*2]
%endif
  movh          m3, [srcq]
  movhps        m3, [srcq+src_strideq]
  movh          m4, [srcq+src_strideq*2]
  movhps        m4, [srcq+src_stride3q]
  psadbw        m1, m3
  psadbw        m2, m4
  lea           refq, [refq+ref_strideq*4]
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*4]
  paddd         m0, m2
  dec           n_rowsd
  jg .loop

  movhlps       m1, m0
  paddd         m0, m1
  movd          eax, m0
  RET
%endmacro

INIT_XMM sse2
SAD8XN 16 ; sad8x16_sse2
SAD8XN 8 ; sad8x8_sse2
SAD8XN 4 ; sad8x4_sse2
SAD8XN 16, 1 ; sad8x16_avg_sse2
SAD8XN 8, 1 ; sad8x8_avg_sse2
SAD8XN 4, 1 ; sad8x4_avg_sse2

; unsigned int vp9_sad4x{4,8}_sse(uint8_t *src, int src_stride,
;                                 uint8_t *ref, int ref_stride);
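; SAD4XN gathers four 4-byte rows with movd/punpckldq.  These variants are
; assembled with INIT_MMX sse, so the m registers are 8 bytes wide (mmsize == 8)
; and each psadbw yields a single 16-bit sum; no movhlps fold is needed before
; the final movd eax, m0.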
%macro SAD4XN 1-2 0
  SAD_FN 4, %1, 7, %2
  mov           n_rowsd, %1/4
  pxor          m0, m0

.loop:
  movd          m1, [refq]
  movd          m2, [refq+ref_strideq]
  movd          m3, [refq+ref_strideq*2]
  movd          m4, [refq+ref_stride3q]
  punpckldq     m1, m2
  punpckldq     m3, m4
%if %2 == 1
  pavgb         m1, [second_predq+mmsize*0]
  pavgb         m3, [second_predq+mmsize*1]
  lea           second_predq, [second_predq+mmsize*2]
%endif
  movd          m2, [srcq]
  movd          m5, [srcq+src_strideq]
  movd          m4, [srcq+src_strideq*2]
  movd          m6, [srcq+src_stride3q]
  punpckldq     m2, m5
  punpckldq     m4, m6
  psadbw        m1, m2
  psadbw        m3, m4
  lea           refq, [refq+ref_strideq*4]
  paddd         m0, m1
  lea           srcq, [srcq+src_strideq*4]
  paddd         m0, m3
  dec           n_rowsd
  jg .loop

  movd          eax, m0
  RET
%endmacro

INIT_MMX sse
SAD4XN 8 ; sad4x8_sse
SAD4XN 4 ; sad4x4_sse
SAD4XN 8, 1 ; sad4x8_avg_sse
SAD4XN 4, 1 ; sad4x4_avg_sse