|
1 ; |
|
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. |
|
3 ; |
|
4 ; Use of this source code is governed by a BSD-style license |
|
5 ; that can be found in the LICENSE file in the root of the source |
|
6 ; tree. An additional intellectual property rights grant can be found |
|
7 ; in the file PATENTS. All contributing project authors may |
|
8 ; be found in the AUTHORS file in the root of the source tree. |
|
9 ; |
|
10 |
|
11 %include "vpx_ports/x86_abi_support.asm" |
|
12 |
|
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
;
; Function prologue shared by the *x3 SAD kernels.  Binds the symbolic
; operand names used by the kernels to ABI-specific locations:
;
;   src_ptr, src_stride, ref_ptr, ref_stride  - the first four arguments
;   end_ptr, ret_var                          - scratch registers
;   result_ptr / max_err / height             - three aliases for the
;                                               fifth argument
;
; 32-bit ABI : saves rbp/rsi/rdi/rbx, sets up rbp as frame pointer and
;              loads the first four arguments from the stack (strides
;              are sign-extended with movsxd).
; Win64      : SAVE_XMM 7, u is invoked first; the arguments stay in
;              rcx/rdx/r8/r9 and the fifth is read from the stack.
; System V   : the arguments stay in rdi/rsi/rdx/rcx/r8.
;
; On both 64-bit ABIs the strides are C 'int' values passed in
; registers, and the upper 32 bits of such a register are not
; guaranteed to hold a sign extension.  Because the kernels use the
; strides as 64-bit address components, they are sign-extended in place
; here, mirroring what the 32-bit path already does.
;
; Must be paired with STACK_FRAME_DESTROY_X3.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_err       arg(4)
  %define     height        dword ptr arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    ; Fifth argument lives on the stack; the offset is
    ; xmm_stack_space (presumably the area reserved by SAVE_XMM)
    ; + 8 + 4*8.
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     max_err     [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]

    movsxd      rdx,        edx                 ; src_stride: int -> 64-bit
    movsxd      r9,         r9d                 ; ref_stride: int -> 64-bit
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     max_err     r8
    %define     height      r8

    movsxd      rsi,        esi                 ; src_stride: int -> 64-bit
    movsxd      rcx,        ecx                 ; ref_stride: int -> 64-bit
  %endif
%endif

%endmacro
|
61 |
|
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
;
; Function epilogue matching STACK_FRAME_CREATE_X3.  Redefines every
; symbolic operand name to empty text so it cannot be used by mistake
; after this point, undoes the ABI-specific prologue work and returns.
;
; 32-bit ABI : pops rbx, rdi, rsi, rbp (reverse of the prologue pushes).
; Win64      : invokes RESTORE_XMM.
; System V   : nothing to restore.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
  ; Argument names.
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  ; Scratch names.
  %define     end_ptr
  %define     ret_var
  ; Aliases of the fifth argument.
  %define     result_ptr
  %define     max_err
  %define     height

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%elif LIBVPX_YASM_WIN64
    RESTORE_XMM
%endif
    ret
%endmacro
|
85 |
|
;-----------------------------------------------------------------------
; PROCESS_16X2X3 stage, src, ref, src_stride, ref_stride
;
; Handles two 16-byte-wide rows.  For each row the source row is
; compared (psadbw) against the reference row at byte offsets +0, +1
; and +2, and the partial sums are collected in xmm5, xmm6 and xmm7
; respectively.
;
;   %1  stage: 0  - first call: the first row's SADs overwrite
;                   xmm5-xmm7, then src/ref advance by two rows
;              1  - accumulate, then src/ref advance by two rows
;              any other value - accumulate, pointers left untouched
;   %2  source pointer register      (read with movdqa, so each source
;                                     row must be 16-byte aligned)
;   %3  reference pointer register   (read with lddqu, no alignment
;                                     requirement)
;   %4  source stride register
;   %5  reference stride register
;
; Clobbers xmm0-xmm3.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 5
%if %1 != 0
        ; Row 0, accumulating call: add onto the running sums.
        movdqa      xmm0,       XMMWORD PTR [%2]
        lddqu       xmm1,       XMMWORD PTR [%3]
        lddqu       xmm2,       XMMWORD PTR [%3+1]
        lddqu       xmm3,       XMMWORD PTR [%3+2]

        psadbw      xmm1,       xmm0
        psadbw      xmm2,       xmm0
        psadbw      xmm3,       xmm0

        paddw       xmm5,       xmm1
        paddw       xmm6,       xmm2
        paddw       xmm7,       xmm3
%else
        ; Row 0, first call: the SADs become the initial sums.
        movdqa      xmm0,       XMMWORD PTR [%2]
        lddqu       xmm5,       XMMWORD PTR [%3]
        lddqu       xmm6,       XMMWORD PTR [%3+1]
        lddqu       xmm7,       XMMWORD PTR [%3+2]

        psadbw      xmm5,       xmm0
        psadbw      xmm6,       xmm0
        psadbw      xmm7,       xmm0
%endif
        ; Row 1: load while the pointers still address row 0.
        movdqa      xmm0,       XMMWORD PTR [%2+%4]
        lddqu       xmm1,       XMMWORD PTR [%3+%5]
        lddqu       xmm2,       XMMWORD PTR [%3+%5+1]
        lddqu       xmm3,       XMMWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        ; Step both pointers down two rows for the next invocation.
        lea         %2,         [%2+%4*2]
        lea         %3,         [%3+%5*2]
%endif

        psadbw      xmm1,       xmm0
        psadbw      xmm2,       xmm0
        psadbw      xmm3,       xmm0

        paddw       xmm5,       xmm1
        paddw       xmm6,       xmm2
        paddw       xmm7,       xmm3
%endmacro
|
128 |
|
;-----------------------------------------------------------------------
; PROCESS_8X2X3 stage, src, ref, src_stride, ref_stride
;
; 8-byte-wide (MMX register) counterpart of PROCESS_16X2X3.  Handles two
; rows; each source row is compared (psadbw) against the reference row
; at byte offsets +0, +1 and +2, with the partial sums collected in
; mm5, mm6 and mm7 respectively.
;
;   %1  stage: 0  - first call: the first row's SADs overwrite mm5-mm7,
;                   then src/ref advance by two rows
;              1  - accumulate, then src/ref advance by two rows
;              any other value - accumulate, pointers left untouched
;   %2  source pointer register
;   %3  reference pointer register
;   %4  source stride register
;   %5  reference stride register
;
; Clobbers mm0-mm3.
;-----------------------------------------------------------------------
%macro PROCESS_8X2X3 5
%if %1 != 0
        ; Row 0, accumulating call: add onto the running sums.
        movq        mm0,        QWORD PTR [%2]
        movq        mm1,        QWORD PTR [%3]
        movq        mm2,        QWORD PTR [%3+1]
        movq        mm3,        QWORD PTR [%3+2]

        psadbw      mm1,        mm0
        psadbw      mm2,        mm0
        psadbw      mm3,        mm0

        paddw       mm5,        mm1
        paddw       mm6,        mm2
        paddw       mm7,        mm3
%else
        ; Row 0, first call: the SADs become the initial sums.
        movq        mm0,        QWORD PTR [%2]
        movq        mm5,        QWORD PTR [%3]
        movq        mm6,        QWORD PTR [%3+1]
        movq        mm7,        QWORD PTR [%3+2]

        psadbw      mm5,        mm0
        psadbw      mm6,        mm0
        psadbw      mm7,        mm0
%endif
        ; Row 1: load while the pointers still address row 0.
        movq        mm0,        QWORD PTR [%2+%4]
        movq        mm1,        QWORD PTR [%3+%5]
        movq        mm2,        QWORD PTR [%3+%5+1]
        movq        mm3,        QWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        ; Step both pointers down two rows for the next invocation.
        lea         %2,         [%2+%4*2]
        lea         %3,         [%3+%5*2]
%endif

        psadbw      mm1,        mm0
        psadbw      mm2,        mm0
        psadbw      mm3,        mm0

        paddw       mm5,        mm1
        paddw       mm6,        mm2
        paddw       mm7,        mm3
%endmacro
|
171 |
|
;-----------------------------------------------------------------------
;void vp9_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of a 16x16 source block against three
; reference blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2.
; Stores the three sums as dwords at results[0], results[1] and
; results[2].  Source rows are read with movdqa and must therefore be
; 16-byte aligned.
;-----------------------------------------------------------------------
global sym(vp9_sad16x16x3_sse3) PRIVATE
sym(vp9_sad16x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 invocations x 2 rows = 16 rows.
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; rcx is free to reuse here: the row loop is finished.
        mov         rcx,        result_ptr

        ; psadbw leaves one partial sum per 64-bit half; fold the high
        ; half onto the low half and store the low dword.

        ; results[0]: reference offset +0
        movq        xmm0,       xmm5
        psrldq      xmm5,       8
        paddw       xmm0,       xmm5
        movd        [rcx],      xmm0

        ; results[1]: reference offset +1
        movq        xmm0,       xmm6
        psrldq      xmm6,       8
        paddw       xmm0,       xmm6
        movd        [rcx+4],    xmm0

        ; results[2]: reference offset +2
        movq        xmm0,       xmm7
        psrldq      xmm7,       8
        paddw       xmm0,       xmm7
        movd        [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3
|
213 |
|
;-----------------------------------------------------------------------
;void vp9_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of a 16-wide, 8-row source block against
; three reference blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2.
; Stores the three sums as dwords at results[0], results[1] and
; results[2].  Source rows are read with movdqa and must therefore be
; 16-byte aligned.
;-----------------------------------------------------------------------
global sym(vp9_sad16x8x3_sse3) PRIVATE
sym(vp9_sad16x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 4 invocations x 2 rows = 8 rows.
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; rcx is free to reuse here: the row loop is finished.
        mov         rcx,        result_ptr

        ; psadbw leaves one partial sum per 64-bit half; fold the high
        ; half onto the low half and store the low dword.

        ; results[0]: reference offset +0
        movq        xmm0,       xmm5
        psrldq      xmm5,       8
        paddw       xmm0,       xmm5
        movd        [rcx],      xmm0

        ; results[1]: reference offset +1
        movq        xmm0,       xmm6
        psrldq      xmm6,       8
        paddw       xmm0,       xmm6
        movd        [rcx+4],    xmm0

        ; results[2]: reference offset +2
        movq        xmm0,       xmm7
        psrldq      xmm7,       8
        paddw       xmm0,       xmm7
        movd        [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3
|
251 |
|
;-----------------------------------------------------------------------
;void vp9_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of an 8-wide, 16-row source block against
; three reference blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2.
; Stores the three sums as dwords at results[0], results[1] and
; results[2].
;
; Works in MMX registers mm0-mm7; this routine itself issues no emms.
;-----------------------------------------------------------------------
global sym(vp9_sad8x16x3_sse3) PRIVATE
sym(vp9_sad8x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 invocations x 2 rows = 16 rows.
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 6
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; rcx is free to reuse here: the row loop is finished.
        mov         rcx,        result_ptr

        ; Pack sum(+0) and sum(+1) into one qword: low dword = mm5,
        ; high dword = mm6.
        punpckldq   mm5,        mm6

        movq        [rcx],      mm5             ; results[0], results[1]
        movd        [rcx+8],    mm7             ; results[2]

    STACK_FRAME_DESTROY_X3
|
280 |
|
;-----------------------------------------------------------------------
;void vp9_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of an 8x8 source block against three
; reference blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2.
; Stores the three sums as dwords at results[0], results[1] and
; results[2].
;
; Works in MMX registers mm0-mm7; this routine itself issues no emms.
;-----------------------------------------------------------------------
global sym(vp9_sad8x8x3_sse3) PRIVATE
sym(vp9_sad8x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 4 invocations x 2 rows = 8 rows.
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
%rep 2
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
%endrep
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; rcx is free to reuse here: the row loop is finished.
        mov         rcx,        result_ptr

        ; Pack sum(+0) and sum(+1) into one qword: low dword = mm5,
        ; high dword = mm6.
        punpckldq   mm5,        mm6

        movq        [rcx],      mm5             ; results[0], results[1]
        movd        [rcx+8],    mm7             ; results[2]

    STACK_FRAME_DESTROY_X3
|
305 |
|
;-----------------------------------------------------------------------
;void vp9_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sum of absolute differences of a 4x4 source block against three
; reference blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2.
; Stores the three sums as dwords at results[0], results[1] and
; results[2].
;
; Two 4-byte rows are interleaved (punpcklbw) into one 8-byte MMX
; register so that a single psadbw covers a row pair; source and
; reference are interleaved the same way, so bytes stay matched.
;
; Works in MMX registers mm0-mm7; this routine itself issues no emms.
;-----------------------------------------------------------------------
global sym(vp9_sad4x4x3_sse3) PRIVATE
sym(vp9_sad4x4x3_sse3):

    STACK_FRAME_CREATE_X3

        ;---- rows 0 and 1 ------------------------------------------------
        ; mm0 = source rows 0/1 interleaved
        movd        mm0,        DWORD PTR [src_ptr]
        movd        mm2,        DWORD PTR [src_ptr+src_stride]
        punpcklbw   mm0,        mm2

        ; mm1 = SAD against reference offset +0
        movd        mm1,        DWORD PTR [ref_ptr]
        movd        mm3,        DWORD PTR [ref_ptr+ref_stride]
        punpcklbw   mm1,        mm3
        psadbw      mm1,        mm0

        ; mm4 = SAD against reference offset +1
        movd        mm4,        DWORD PTR [ref_ptr+1]
        movd        mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        punpcklbw   mm4,        mm2
        psadbw      mm4,        mm0

        ; mm5 = SAD against reference offset +2
        movd        mm5,        DWORD PTR [ref_ptr+2]
        movd        mm3,        DWORD PTR [ref_ptr+ref_stride+2]
        punpcklbw   mm5,        mm3
        psadbw      mm5,        mm0

        ;---- rows 2 and 3 ------------------------------------------------
        lea         src_ptr,    [src_ptr+src_stride*2]
        lea         ref_ptr,    [ref_ptr+ref_stride*2]

        ; mm0 = source rows 2/3 interleaved
        movd        mm0,        DWORD PTR [src_ptr]
        movd        mm3,        DWORD PTR [src_ptr+src_stride]
        punpcklbw   mm0,        mm3

        ; mm1 += SAD against reference offset +0
        movd        mm2,        DWORD PTR [ref_ptr]
        movd        mm6,        DWORD PTR [ref_ptr+ref_stride]
        punpcklbw   mm2,        mm6
        psadbw      mm2,        mm0
        paddw       mm1,        mm2

        ; mm3 = total SAD for reference offset +1
        movd        mm3,        DWORD PTR [ref_ptr+1]
        movd        mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        punpcklbw   mm3,        mm2
        psadbw      mm3,        mm0
        paddw       mm3,        mm4

        ; mm7 = total SAD for reference offset +2
        movd        mm7,        DWORD PTR [ref_ptr+2]
        movd        mm6,        DWORD PTR [ref_ptr+ref_stride+2]
        punpcklbw   mm7,        mm6
        psadbw      mm7,        mm0
        paddw       mm7,        mm5

        ;---- store -------------------------------------------------------
        ; rcx is free to reuse here: all loads are done.
        mov         rcx,        result_ptr

        ; Pack sum(+0) and sum(+1) into one qword: low dword = mm1,
        ; high dword = mm3.
        punpckldq   mm1,        mm3

        movq        [rcx],      mm1             ; results[0], results[1]
        movd        [rcx+8],    mm7             ; results[2]

    STACK_FRAME_DESTROY_X3