;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

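; STACK_FRAME_CREATE_X3 maps the five C arguments (src_ptr, src_stride,
; ref_ptr, ref_stride, and the final results/max_sad/height argument) onto
; registers for each supported ABI: 32-bit stack-based arguments, Win64, and
; the 64-bit SysV calling convention.  The matching STACK_FRAME_DESTROY_X3
; macro clears the aliases, restores the saved registers, and returns.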
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_sad       arg(4)
  %define     height        dword ptr arg(4)
        push        rbp
        mov         rbp, rsp
        push        rsi
        push        rdi
        push        rbx

        mov         rsi, arg(0)              ; src_ptr
        mov         rdi, arg(2)              ; ref_ptr

        movsxd      rax, dword ptr arg(1)    ; src_stride
        movsxd      rdx, dword ptr arg(3)    ; ref_stride
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 7, u
    %define     src_ptr       rcx
    %define     src_stride    rdx
    %define     ref_ptr       r8
    %define     ref_stride    r9
    %define     end_ptr       r10
    %define     ret_var       r11
    %define     result_ptr    [rsp+xmm_stack_space+8+4*8]
    %define     max_sad       [rsp+xmm_stack_space+8+4*8]
    %define     height        dword ptr [rsp+xmm_stack_space+8+4*8]
  %else
    %define     src_ptr       rdi
    %define     src_stride    rsi
    %define     ref_ptr       rdx
    %define     ref_stride    rcx
    %define     end_ptr       r9
    %define     ret_var       r10
    %define     result_ptr    r8
    %define     max_sad       r8
    %define     height        r8
  %endif
%endif

%endmacro

%macro STACK_FRAME_DESTROY_X3 0
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  %define     end_ptr
  %define     ret_var
  %define     result_ptr
  %define     max_sad
  %define     height

%if ABI_IS_32BIT
        pop         rbx
        pop         rdi
        pop         rsi
        pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
        ret
%endmacro

%macro STACK_FRAME_CREATE_X4 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     r0_ptr        rcx
  %define     r1_ptr        rdx
  %define     r2_ptr        rbx
  %define     r3_ptr        rdi
  %define     ref_stride    rbp
  %define     result_ptr    arg(4)
        push        rbp
        mov         rbp, rsp
        push        rsi
        push        rdi
        push        rbx

        push        rbp
        mov         rdi, arg(2)              ; ref_ptr_base

        LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

        mov         rsi, arg(0)              ; src_ptr

        movsxd      rbx, dword ptr arg(1)    ; src_stride
        movsxd      rbp, dword ptr arg(3)    ; ref_stride

        xchg        rbx, rax
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 7, u
    %define     src_ptr       rcx
    %define     src_stride    rdx
    %define     r0_ptr        rsi
    %define     r1_ptr        r10
    %define     r2_ptr        r11
    %define     r3_ptr        r8
    %define     ref_stride    r9
    %define     result_ptr    [rsp+xmm_stack_space+16+4*8]
        push        rsi

        LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
  %else
    %define     src_ptr       rdi
    %define     src_stride    rsi
    %define     r0_ptr        r9
    %define     r1_ptr        r10
    %define     r2_ptr        r11
    %define     r3_ptr        rdx
    %define     ref_stride    rcx
    %define     result_ptr    r8

        LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr

  %endif
%endif
%endmacro

%macro STACK_FRAME_DESTROY_X4 0
  %define     src_ptr
  %define     src_stride
  %define     r0_ptr
  %define     r1_ptr
  %define     r2_ptr
  %define     r3_ptr
  %define     ref_stride
  %define     result_ptr

%if ABI_IS_32BIT
        pop         rbx
        pop         rdi
        pop         rsi
        pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    pop         rsi
    RESTORE_XMM
  %endif
%endif
        ret
%endmacro

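; PROCESS_16X2X3 accumulates the SADs of two 16-byte source rows against the
; reference block at byte offsets 0, +1 and +2 into xmm5, xmm6 and xmm7.
; %1 selects the step: 0 initializes the accumulators (and advances the row
; pointers), 1 accumulates and advances, 2 accumulates without advancing
; (used for the last row pair).  The source row is loaded with movdqa, which
; requires 16-byte alignment; the reference rows use lddqu because the +1/+2
; offsets are necessarily unaligned.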
%macro PROCESS_16X2X3 5
%if %1==0
        movdqa      xmm0, XMMWORD PTR [%2]
        lddqu       xmm5, XMMWORD PTR [%3]
        lddqu       xmm6, XMMWORD PTR [%3+1]
        lddqu       xmm7, XMMWORD PTR [%3+2]

        psadbw      xmm5, xmm0
        psadbw      xmm6, xmm0
        psadbw      xmm7, xmm0
%else
        movdqa      xmm0, XMMWORD PTR [%2]
        lddqu       xmm1, XMMWORD PTR [%3]
        lddqu       xmm2, XMMWORD PTR [%3+1]
        lddqu       xmm3, XMMWORD PTR [%3+2]

        psadbw      xmm1, xmm0
        psadbw      xmm2, xmm0
        psadbw      xmm3, xmm0

        paddw       xmm5, xmm1
        paddw       xmm6, xmm2
        paddw       xmm7, xmm3
%endif
        movdqa      xmm0, XMMWORD PTR [%2+%4]
        lddqu       xmm1, XMMWORD PTR [%3+%5]
        lddqu       xmm2, XMMWORD PTR [%3+%5+1]
        lddqu       xmm3, XMMWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        lea         %2, [%2+%4*2]
        lea         %3, [%3+%5*2]
%endif

        psadbw      xmm1, xmm0
        psadbw      xmm2, xmm0
        psadbw      xmm3, xmm0

        paddw       xmm5, xmm1
        paddw       xmm6, xmm2
        paddw       xmm7, xmm3
%endmacro

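; PROCESS_8X2X3 is the 8-pixel-wide MMX counterpart of PROCESS_16X2X3: two
; 8-byte source rows are compared against the reference at offsets 0, +1 and
; +2, with the running sums kept in mm5, mm6 and mm7.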
%macro PROCESS_8X2X3 5
%if %1==0
        movq        mm0, QWORD PTR [%2]
        movq        mm5, QWORD PTR [%3]
        movq        mm6, QWORD PTR [%3+1]
        movq        mm7, QWORD PTR [%3+2]

        psadbw      mm5, mm0
        psadbw      mm6, mm0
        psadbw      mm7, mm0
%else
        movq        mm0, QWORD PTR [%2]
        movq        mm1, QWORD PTR [%3]
        movq        mm2, QWORD PTR [%3+1]
        movq        mm3, QWORD PTR [%3+2]

        psadbw      mm1, mm0
        psadbw      mm2, mm0
        psadbw      mm3, mm0

        paddw       mm5, mm1
        paddw       mm6, mm2
        paddw       mm7, mm3
%endif
        movq        mm0, QWORD PTR [%2+%4]
        movq        mm1, QWORD PTR [%3+%5]
        movq        mm2, QWORD PTR [%3+%5+1]
        movq        mm3, QWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        lea         %2, [%2+%4*2]
        lea         %3, [%3+%5*2]
%endif

        psadbw      mm1, mm0
        psadbw      mm2, mm0
        psadbw      mm3, mm0

        paddw       mm5, mm1
        paddw       mm6, mm2
        paddw       mm7, mm3
%endmacro

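; LOAD_X4_ADDRESSES reads four reference pointers from the pointer array at
; %1 (ref_ptr_base in the x4d functions) into the registers %2..%5.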
%macro LOAD_X4_ADDRESSES 5
        mov         %2, [%1+REG_SZ_BYTES*0]
        mov         %3, [%1+REG_SZ_BYTES*1]

        mov         %4, [%1+REG_SZ_BYTES*2]
        mov         %5, [%1+REG_SZ_BYTES*3]
%endmacro

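; PROCESS_16X2X4 accumulates the SADs of two 16-byte source rows against four
; independent reference blocks (%3..%6) into xmm4..xmm7.  As above, %1 = 0
; initializes, 1 accumulates and advances, 2 accumulates without advancing.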
%macro PROCESS_16X2X4 8
%if %1==0
        movdqa      xmm0, XMMWORD PTR [%2]
        lddqu       xmm4, XMMWORD PTR [%3]
        lddqu       xmm5, XMMWORD PTR [%4]
        lddqu       xmm6, XMMWORD PTR [%5]
        lddqu       xmm7, XMMWORD PTR [%6]

        psadbw      xmm4, xmm0
        psadbw      xmm5, xmm0
        psadbw      xmm6, xmm0
        psadbw      xmm7, xmm0
%else
        movdqa      xmm0, XMMWORD PTR [%2]
        lddqu       xmm1, XMMWORD PTR [%3]
        lddqu       xmm2, XMMWORD PTR [%4]
        lddqu       xmm3, XMMWORD PTR [%5]

        psadbw      xmm1, xmm0
        psadbw      xmm2, xmm0
        psadbw      xmm3, xmm0

        paddw       xmm4, xmm1
        lddqu       xmm1, XMMWORD PTR [%6]
        paddw       xmm5, xmm2
        paddw       xmm6, xmm3

        psadbw      xmm1, xmm0
        paddw       xmm7, xmm1
%endif
        movdqa      xmm0, XMMWORD PTR [%2+%7]
        lddqu       xmm1, XMMWORD PTR [%3+%8]
        lddqu       xmm2, XMMWORD PTR [%4+%8]
        lddqu       xmm3, XMMWORD PTR [%5+%8]

        psadbw      xmm1, xmm0
        psadbw      xmm2, xmm0
        psadbw      xmm3, xmm0

        paddw       xmm4, xmm1
        lddqu       xmm1, XMMWORD PTR [%6+%8]
        paddw       xmm5, xmm2
        paddw       xmm6, xmm3

%if %1==0 || %1==1
        lea         %2, [%2+%7*2]
        lea         %3, [%3+%8*2]

        lea         %4, [%4+%8*2]
        lea         %5, [%5+%8*2]

        lea         %6, [%6+%8*2]
%endif
        psadbw      xmm1, xmm0
        paddw       xmm7, xmm1

%endmacro

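; PROCESS_8X2X4 is the 8-pixel-wide MMX version of PROCESS_16X2X4, with the
; four running sums kept in mm4..mm7.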
%macro PROCESS_8X2X4 8
%if %1==0
        movq        mm0, QWORD PTR [%2]
        movq        mm4, QWORD PTR [%3]
        movq        mm5, QWORD PTR [%4]
        movq        mm6, QWORD PTR [%5]
        movq        mm7, QWORD PTR [%6]

        psadbw      mm4, mm0
        psadbw      mm5, mm0
        psadbw      mm6, mm0
        psadbw      mm7, mm0
%else
        movq        mm0, QWORD PTR [%2]
        movq        mm1, QWORD PTR [%3]
        movq        mm2, QWORD PTR [%4]
        movq        mm3, QWORD PTR [%5]

        psadbw      mm1, mm0
        psadbw      mm2, mm0
        psadbw      mm3, mm0

        paddw       mm4, mm1
        movq        mm1, QWORD PTR [%6]
        paddw       mm5, mm2
        paddw       mm6, mm3

        psadbw      mm1, mm0
        paddw       mm7, mm1
%endif
        movq        mm0, QWORD PTR [%2+%7]
        movq        mm1, QWORD PTR [%3+%8]
        movq        mm2, QWORD PTR [%4+%8]
        movq        mm3, QWORD PTR [%5+%8]

        psadbw      mm1, mm0
        psadbw      mm2, mm0
        psadbw      mm3, mm0

        paddw       mm4, mm1
        movq        mm1, QWORD PTR [%6+%8]
        paddw       mm5, mm2
        paddw       mm6, mm3

%if %1==0 || %1==1
        lea         %2, [%2+%7*2]
        lea         %3, [%3+%8*2]

        lea         %4, [%4+%8*2]
        lea         %5, [%5+%8*2]

        lea         %6, [%6+%8*2]
%endif
        psadbw      mm1, mm0
        paddw       mm7, mm1

%endmacro

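; The *x3 functions below compute three SADs per call: the source block
; against the reference block at horizontal offsets 0, +1 and +2, written to
; results[0..2].
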
;void vp8_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
global sym(vp8_sad16x16x3_sse3) PRIVATE
sym(vp8_sad16x16x3_sse3):

        STACK_FRAME_CREATE_X3

        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov         rcx, result_ptr

        movq        xmm0, xmm5
        psrldq      xmm5, 8

        paddw       xmm0, xmm5
        movd        [rcx], xmm0
;-
        movq        xmm0, xmm6
        psrldq      xmm6, 8

        paddw       xmm0, xmm6
        movd        [rcx+4], xmm0
;-
        movq        xmm0, xmm7
        psrldq      xmm7, 8

        paddw       xmm0, xmm7
        movd        [rcx+8], xmm0

        STACK_FRAME_DESTROY_X3

;void vp8_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
global sym(vp8_sad16x8x3_sse3) PRIVATE
sym(vp8_sad16x8x3_sse3):

        STACK_FRAME_CREATE_X3

        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov         rcx, result_ptr

        movq        xmm0, xmm5
        psrldq      xmm5, 8

        paddw       xmm0, xmm5
        movd        [rcx], xmm0
;-
        movq        xmm0, xmm6
        psrldq      xmm6, 8

        paddw       xmm0, xmm6
        movd        [rcx+4], xmm0
;-
        movq        xmm0, xmm7
        psrldq      xmm7, 8

        paddw       xmm0, xmm7
        movd        [rcx+8], xmm0

        STACK_FRAME_DESTROY_X3

;void vp8_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
global sym(vp8_sad8x16x3_sse3) PRIVATE
sym(vp8_sad8x16x3_sse3):

        STACK_FRAME_CREATE_X3

        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov         rcx, result_ptr

        punpckldq   mm5, mm6

        movq        [rcx], mm5
        movd        [rcx+8], mm7

        STACK_FRAME_DESTROY_X3

;void vp8_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
global sym(vp8_sad8x8x3_sse3) PRIVATE
sym(vp8_sad8x8x3_sse3):

        STACK_FRAME_CREATE_X3

        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov         rcx, result_ptr

        punpckldq   mm5, mm6

        movq        [rcx], mm5
        movd        [rcx+8], mm7

        STACK_FRAME_DESTROY_X3

;void vp8_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
global sym(vp8_sad4x4x3_sse3) PRIVATE
sym(vp8_sad4x4x3_sse3):

        STACK_FRAME_CREATE_X3

        movd        mm0, DWORD PTR [src_ptr]
        movd        mm1, DWORD PTR [ref_ptr]

        movd        mm2, DWORD PTR [src_ptr+src_stride]
        movd        mm3, DWORD PTR [ref_ptr+ref_stride]

        punpcklbw   mm0, mm2
        punpcklbw   mm1, mm3

        movd        mm4, DWORD PTR [ref_ptr+1]
        movd        mm5, DWORD PTR [ref_ptr+2]

        movd        mm2, DWORD PTR [ref_ptr+ref_stride+1]
        movd        mm3, DWORD PTR [ref_ptr+ref_stride+2]

        psadbw      mm1, mm0

        punpcklbw   mm4, mm2
        punpcklbw   mm5, mm3

        psadbw      mm4, mm0
        psadbw      mm5, mm0

        lea         src_ptr, [src_ptr+src_stride*2]
        lea         ref_ptr, [ref_ptr+ref_stride*2]

        movd        mm0, DWORD PTR [src_ptr]
        movd        mm2, DWORD PTR [ref_ptr]

        movd        mm3, DWORD PTR [src_ptr+src_stride]
        movd        mm6, DWORD PTR [ref_ptr+ref_stride]

        punpcklbw   mm0, mm3
        punpcklbw   mm2, mm6

        movd        mm3, DWORD PTR [ref_ptr+1]
        movd        mm7, DWORD PTR [ref_ptr+2]

        psadbw      mm2, mm0

        paddw       mm1, mm2

        movd        mm2, DWORD PTR [ref_ptr+ref_stride+1]
        movd        mm6, DWORD PTR [ref_ptr+ref_stride+2]

        punpcklbw   mm3, mm2
        punpcklbw   mm7, mm6

        psadbw      mm3, mm0
        psadbw      mm7, mm0

        paddw       mm3, mm4
        paddw       mm7, mm5

        mov         rcx, result_ptr

        punpckldq   mm1, mm3

        movq        [rcx], mm1
        movd        [rcx+8], mm7

        STACK_FRAME_DESTROY_X3

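; Plain 16x16 SAD.  The max_sad argument is declared but not used by this
; implementation; the full SAD is accumulated in xmm7 and returned in rax.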
;unsigned int vp8_sad16x16_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_sad)
;%define lddqu movdqu
global sym(vp8_sad16x16_sse3) PRIVATE
sym(vp8_sad16x16_sse3):

        STACK_FRAME_CREATE_X3

        mov         end_ptr, 4
        pxor        xmm7, xmm7

.vp8_sad16x16_sse3_loop:
        movdqa      xmm0, XMMWORD PTR [src_ptr]
        movdqu      xmm1, XMMWORD PTR [ref_ptr]
        movdqa      xmm2, XMMWORD PTR [src_ptr+src_stride]
        movdqu      xmm3, XMMWORD PTR [ref_ptr+ref_stride]

        lea         src_ptr, [src_ptr+src_stride*2]
        lea         ref_ptr, [ref_ptr+ref_stride*2]

        movdqa      xmm4, XMMWORD PTR [src_ptr]
        movdqu      xmm5, XMMWORD PTR [ref_ptr]
        movdqa      xmm6, XMMWORD PTR [src_ptr+src_stride]

        psadbw      xmm0, xmm1

        movdqu      xmm1, XMMWORD PTR [ref_ptr+ref_stride]

        psadbw      xmm2, xmm3
        psadbw      xmm4, xmm5
        psadbw      xmm6, xmm1

        lea         src_ptr, [src_ptr+src_stride*2]
        lea         ref_ptr, [ref_ptr+ref_stride*2]

        paddw       xmm7, xmm0
        paddw       xmm7, xmm2
        paddw       xmm7, xmm4
        paddw       xmm7, xmm6

        sub         end_ptr, 1
        jne         .vp8_sad16x16_sse3_loop

        movq        xmm0, xmm7
        psrldq      xmm7, 8
        paddw       xmm0, xmm7
        movq        rax, xmm0

        STACK_FRAME_DESTROY_X3

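; Copies a 32-byte-wide block of 'height' rows from src_ptr to dst_ptr, four
; rows per iteration with a single-row tail loop.  The stores use movdqa, so
; the destination rows are expected to be 16-byte aligned.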
;void vp8_copy32xn_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int height);
global sym(vp8_copy32xn_sse3) PRIVATE
sym(vp8_copy32xn_sse3):

        STACK_FRAME_CREATE_X3

.block_copy_sse3_loopx4:
        lea         end_ptr, [src_ptr+src_stride*2]

        movdqu      xmm0, XMMWORD PTR [src_ptr]
        movdqu      xmm1, XMMWORD PTR [src_ptr + 16]
        movdqu      xmm2, XMMWORD PTR [src_ptr + src_stride]
        movdqu      xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
        movdqu      xmm4, XMMWORD PTR [end_ptr]
        movdqu      xmm5, XMMWORD PTR [end_ptr + 16]
        movdqu      xmm6, XMMWORD PTR [end_ptr + src_stride]
        movdqu      xmm7, XMMWORD PTR [end_ptr + src_stride + 16]

        lea         src_ptr, [src_ptr+src_stride*4]

        lea         end_ptr, [ref_ptr+ref_stride*2]

        movdqa      XMMWORD PTR [ref_ptr], xmm0
        movdqa      XMMWORD PTR [ref_ptr + 16], xmm1
        movdqa      XMMWORD PTR [ref_ptr + ref_stride], xmm2
        movdqa      XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
        movdqa      XMMWORD PTR [end_ptr], xmm4
        movdqa      XMMWORD PTR [end_ptr + 16], xmm5
        movdqa      XMMWORD PTR [end_ptr + ref_stride], xmm6
        movdqa      XMMWORD PTR [end_ptr + ref_stride + 16], xmm7

        lea         ref_ptr, [ref_ptr+ref_stride*4]

        sub         height, 4
        cmp         height, 4
        jge         .block_copy_sse3_loopx4

        ; Check to see if there are more rows that need to be copied.
        cmp         height, 0
        je          .copy_is_done

.block_copy_sse3_loop:
        movdqu      xmm0, XMMWORD PTR [src_ptr]
        movdqu      xmm1, XMMWORD PTR [src_ptr + 16]
        lea         src_ptr, [src_ptr+src_stride]

        movdqa      XMMWORD PTR [ref_ptr], xmm0
        movdqa      XMMWORD PTR [ref_ptr + 16], xmm1
        lea         ref_ptr, [ref_ptr+ref_stride]

        sub         height, 1
        jne         .block_copy_sse3_loop

.copy_is_done:
        STACK_FRAME_DESTROY_X3

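; The *x4d functions below take a ref_ptr_base array of four reference
; pointers and compute one SAD per reference block, writing the four sums to
; results[0..3].
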
;void vp8_sad16x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
global sym(vp8_sad16x16x4d_sse3) PRIVATE
sym(vp8_sad16x16x4d_sse3):

        STACK_FRAME_CREATE_X4

        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        pop         rbp
%endif
        mov         rcx, result_ptr

        movq        xmm0, xmm4
        psrldq      xmm4, 8

        paddw       xmm0, xmm4
        movd        [rcx], xmm0
;-
        movq        xmm0, xmm5
        psrldq      xmm5, 8

        paddw       xmm0, xmm5
        movd        [rcx+4], xmm0
;-
        movq        xmm0, xmm6
        psrldq      xmm6, 8

        paddw       xmm0, xmm6
        movd        [rcx+8], xmm0
;-
        movq        xmm0, xmm7
        psrldq      xmm7, 8

        paddw       xmm0, xmm7
        movd        [rcx+12], xmm0

        STACK_FRAME_DESTROY_X4

;void vp8_sad16x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
global sym(vp8_sad16x8x4d_sse3) PRIVATE
sym(vp8_sad16x8x4d_sse3):

        STACK_FRAME_CREATE_X4

        PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        pop         rbp
%endif
        mov         rcx, result_ptr

        movq        xmm0, xmm4
        psrldq      xmm4, 8

        paddw       xmm0, xmm4
        movd        [rcx], xmm0
;-
        movq        xmm0, xmm5
        psrldq      xmm5, 8

        paddw       xmm0, xmm5
        movd        [rcx+4], xmm0
;-
        movq        xmm0, xmm6
        psrldq      xmm6, 8

        paddw       xmm0, xmm6
        movd        [rcx+8], xmm0
;-
        movq        xmm0, xmm7
        psrldq      xmm7, 8

        paddw       xmm0, xmm7
        movd        [rcx+12], xmm0

        STACK_FRAME_DESTROY_X4

;void vp8_sad8x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
global sym(vp8_sad8x16x4d_sse3) PRIVATE
sym(vp8_sad8x16x4d_sse3):

        STACK_FRAME_CREATE_X4

        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        pop         rbp
%endif
        mov         rcx, result_ptr

        punpckldq   mm4, mm5
        punpckldq   mm6, mm7

        movq        [rcx], mm4
        movq        [rcx+8], mm6

        STACK_FRAME_DESTROY_X4

;void vp8_sad8x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
global sym(vp8_sad8x8x4d_sse3) PRIVATE
sym(vp8_sad8x8x4d_sse3):

        STACK_FRAME_CREATE_X4

        PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
        PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
        pop         rbp
%endif
        mov         rcx, result_ptr

        punpckldq   mm4, mm5
        punpckldq   mm6, mm7

        movq        [rcx], mm4
        movq        [rcx+8], mm6

        STACK_FRAME_DESTROY_X4

;void vp8_sad4x4x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
global sym(vp8_sad4x4x4d_sse3) PRIVATE
sym(vp8_sad4x4x4d_sse3):

        STACK_FRAME_CREATE_X4

        movd        mm0, DWORD PTR [src_ptr]
        movd        mm1, DWORD PTR [r0_ptr]

        movd        mm2, DWORD PTR [src_ptr+src_stride]
        movd        mm3, DWORD PTR [r0_ptr+ref_stride]

        punpcklbw   mm0, mm2
        punpcklbw   mm1, mm3

        movd        mm4, DWORD PTR [r1_ptr]
        movd        mm5, DWORD PTR [r2_ptr]

        movd        mm6, DWORD PTR [r3_ptr]
        movd        mm2, DWORD PTR [r1_ptr+ref_stride]

        movd        mm3, DWORD PTR [r2_ptr+ref_stride]
        movd        mm7, DWORD PTR [r3_ptr+ref_stride]

        psadbw      mm1, mm0

        punpcklbw   mm4, mm2
        punpcklbw   mm5, mm3

        punpcklbw   mm6, mm7
        psadbw      mm4, mm0

        psadbw      mm5, mm0
        psadbw      mm6, mm0

        lea         src_ptr, [src_ptr+src_stride*2]
        lea         r0_ptr,  [r0_ptr+ref_stride*2]

        lea         r1_ptr,  [r1_ptr+ref_stride*2]
        lea         r2_ptr,  [r2_ptr+ref_stride*2]

        lea         r3_ptr,  [r3_ptr+ref_stride*2]

        movd        mm0, DWORD PTR [src_ptr]
        movd        mm2, DWORD PTR [r0_ptr]

        movd        mm3, DWORD PTR [src_ptr+src_stride]
        movd        mm7, DWORD PTR [r0_ptr+ref_stride]

        punpcklbw   mm0, mm3
        punpcklbw   mm2, mm7

        movd        mm3, DWORD PTR [r1_ptr]
        movd        mm7, DWORD PTR [r2_ptr]

        psadbw      mm2, mm0
%if ABI_IS_32BIT
        mov         rax, rbp

        pop         rbp
  %define     ref_stride    rax
%endif
        mov         rsi, result_ptr

        paddw       mm1, mm2
        movd        [rsi], mm1

        movd        mm2, DWORD PTR [r1_ptr+ref_stride]
        movd        mm1, DWORD PTR [r2_ptr+ref_stride]

        punpcklbw   mm3, mm2
        punpcklbw   mm7, mm1

        psadbw      mm3, mm0
        psadbw      mm7, mm0

        movd        mm2, DWORD PTR [r3_ptr]
        movd        mm1, DWORD PTR [r3_ptr+ref_stride]

        paddw       mm3, mm4
        paddw       mm7, mm5

        movd        [rsi+4], mm3
        punpcklbw   mm2, mm1

        movd        [rsi+8], mm7
        psadbw      mm2, mm0

        paddw       mm2, mm6
        movd        [rsi+12], mm2

        STACK_FRAME_DESTROY_X4