Thu, 15 Jan 2015 15:59:08 +0100
Implement a real Private Browsing Mode condition by changing the API/ABI;
This solves Tor bug #9701, complying with disk avoidance documented in
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
11 %include "vpx_ports/x86_abi_support.asm"
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
; Prologue for the five-argument routines in this file:
;   (src_ptr, src_stride, ref_ptr, ref_stride, <fifth argument>)
; It defines the symbolic names src_ptr, src_stride, ref_ptr, ref_stride,
; end_ptr (scratch), ret_var (scratch) and three names for the fifth
; argument (result_ptr / max_sad / height), choosing registers per ABI.
;
;   32-bit : pushes rbp/rsi/rdi/rbx, loads the first four arguments from
;            the stack through arg(); the fifth stays in memory (arg(4)).
;   Win64  : invokes SAVE_XMM 7, u (defined in the included ABI header;
;            presumably preserves the callee-saved xmm6-xmm7 -- confirm)
;            and reads the fifth argument from the caller's stack.
;   other  : the arguments are used directly in rdi, rsi, rdx, rcx, r8.
;
; Must be paired with STACK_FRAME_DESTROY_X3.
;
; NOTE(review): in both 64-bit branches src_stride / ref_stride alias the
; full 64-bit argument registers although the C prototypes declare them
; as int; only the 32-bit branch widens them with movsxd. Presumably the
; callers always deliver correctly extended values -- confirm.
;-----------------------------------------------------------------------
13 %macro STACK_FRAME_CREATE_X3 0
14 %if ABI_IS_32BIT
15 %define src_ptr rsi
16 %define src_stride rax
17 %define ref_ptr rdi
18 %define ref_stride rdx
19 %define end_ptr rcx
20 %define ret_var rbx
; the fifth argument is left on the stack; all three names refer to it
21 %define result_ptr arg(4)
22 %define max_sad arg(4)
23 %define height dword ptr arg(4)
24 push rbp
25 mov rbp, rsp
26 push rsi
27 push rdi
28 push rbx
30 mov rsi, arg(0) ; src_ptr
31 mov rdi, arg(2) ; ref_ptr
33 movsxd rax, dword ptr arg(1) ; src_stride
34 movsxd rdx, dword ptr arg(3) ; ref_stride
35 %else
36 %if LIBVPX_YASM_WIN64
37 SAVE_XMM 7, u
38 %define src_ptr rcx
39 %define src_stride rdx
40 %define ref_ptr r8
41 %define ref_stride r9
42 %define end_ptr r10
43 %define ret_var r11
; fifth argument lives on the stack, addressed past the xmm save area
44 %define result_ptr [rsp+xmm_stack_space+8+4*8]
45 %define max_sad [rsp+xmm_stack_space+8+4*8]
46 %define height dword ptr [rsp+xmm_stack_space+8+4*8]
47 %else
48 %define src_ptr rdi
49 %define src_stride rsi
50 %define ref_ptr rdx
51 %define ref_stride rcx
52 %define end_ptr r9
53 %define ret_var r10
54 %define result_ptr r8
55 %define max_sad r8
; height is a 32-bit int: alias r8d, not r8, so that the sub/cmp/jge
; sequence in vp8_copy32xn_sse3 only looks at the 32 bits that carry the
; argument (the upper half of the register is not defined for an int
; argument). This matches the dword accesses of the two branches above.
56 %define height r8d
57 %endif
58 %endif
60 %endmacro
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
; Epilogue matching STACK_FRAME_CREATE_X3. Redefines every symbolic
; argument name to empty so it cannot leak into later code, undoes the
; ABI-specific prologue work and returns to the caller.
;-----------------------------------------------------------------------
62 %macro STACK_FRAME_DESTROY_X3 0
63 %define src_ptr
64 %define src_stride
65 %define ref_ptr
66 %define ref_stride
67 %define end_ptr
68 %define ret_var
69 %define result_ptr
70 %define max_sad
71 %define height
73 %if ABI_IS_32BIT
; pops mirror the pushes of the prologue in reverse order
74 pop rbx
75 pop rdi
76 pop rsi
77 pop rbp
78 %else
79 %if LIBVPX_YASM_WIN64
; counterpart of the SAVE_XMM issued by the prologue
80 RESTORE_XMM
81 %endif
82 %endif
83 ret
84 %endmacro
;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X4
; Prologue for the "x4d" routines:
;   (src_ptr, src_stride, ref_ptr_base, ref_stride, results)
; The third argument is read as the address of four consecutive
; pointers; they are loaded into r0_ptr..r3_ptr via LOAD_X4_ADDRESSES.
; Defines src_ptr, src_stride, r0_ptr..r3_ptr, ref_stride, result_ptr.
;
;   32-bit : every general register is needed, so rbp is reused as
;            ref_stride. rbp is pushed a second time beforehand; callers
;            of this macro pop it again before they touch result_ptr
;            (arg(4)). NOTE(review): arg() comes from the included ABI
;            header and is presumably rbp-relative -- confirm.
;   Win64  : SAVE_XMM 7, u plus a push of rsi, which is then used as
;            r0_ptr. result_ptr uses +16 where the X3 prologue uses +8,
;            presumably to account for that extra push -- confirm.
;   other  : arguments are used in place; r3_ptr reuses rdx, the
;            register that carried ref_ptr_base.
;
; Must be paired with STACK_FRAME_DESTROY_X4.
;-----------------------------------------------------------------------
86 %macro STACK_FRAME_CREATE_X4 0
87 %if ABI_IS_32BIT
88 %define src_ptr rsi
89 %define src_stride rax
90 %define r0_ptr rcx
91 %define r1_ptr rdx
92 %define r2_ptr rbx
93 %define r3_ptr rdi
94 %define ref_stride rbp
95 %define result_ptr arg(4)
96 push rbp
97 mov rbp, rsp
98 push rsi
99 push rdi
100 push rbx
; second save of rbp: it is about to be overwritten with ref_stride
102 push rbp
103 mov rdi, arg(2) ; ref_ptr_base
; the third pointer is parked in rax for now; rdi is both the table
; base and the destination of the last load
105 LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi
107 mov rsi, arg(0) ; src_ptr
109 movsxd rbx, dword ptr arg(1) ; src_stride
110 movsxd rbp, dword ptr arg(3) ; ref_stride
; swap so that rbx = third reference pointer (r2_ptr), rax = src_stride
112 xchg rbx, rax
113 %else
114 %if LIBVPX_YASM_WIN64
115 SAVE_XMM 7, u
116 %define src_ptr rcx
117 %define src_stride rdx
118 %define r0_ptr rsi
119 %define r1_ptr r10
120 %define r2_ptr r11
121 %define r3_ptr r8
122 %define ref_stride r9
123 %define result_ptr [rsp+xmm_stack_space+16+4*8]
; rsi is saved here and restored by STACK_FRAME_DESTROY_X4
124 push rsi
; r8 is the table base and, as r3_ptr, the destination of the last load
126 LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
127 %else
128 %define src_ptr rdi
129 %define src_stride rsi
130 %define r0_ptr r9
131 %define r1_ptr r10
132 %define r2_ptr r11
133 %define r3_ptr rdx
134 %define ref_stride rcx
135 %define result_ptr r8
; rdx is the table base and, as r3_ptr, the destination of the last load
137 LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr
139 %endif
140 %endif
141 %endmacro
;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X4
; Epilogue matching STACK_FRAME_CREATE_X4. Redefines the symbolic names
; to empty, restores the saved registers and returns.
; On 32-bit builds only four pops are issued here; the fifth value
; pushed by the prologue (the second copy of rbp) is popped by the
; routine itself before this macro is reached.
;-----------------------------------------------------------------------
143 %macro STACK_FRAME_DESTROY_X4 0
144 %define src_ptr
145 %define src_stride
146 %define r0_ptr
147 %define r1_ptr
148 %define r2_ptr
149 %define r3_ptr
150 %define ref_stride
151 %define result_ptr
153 %if ABI_IS_32BIT
154 pop rbx
155 pop rdi
156 pop rsi
157 pop rbp
158 %else
159 %if LIBVPX_YASM_WIN64
; rsi was pushed by the prologue after SAVE_XMM, so it is popped first
160 pop rsi
161 RESTORE_XMM
162 %endif
163 %endif
164 ret
165 %endmacro
;-----------------------------------------------------------------------
; PROCESS_16X2X3 stage, src, ref, src_stride, ref_stride
; Handles two 16-byte rows and accumulates three sums of absolute
; differences: source row against the reference row at byte offsets
; +0, +1 and +2.
;   %1  stage selector: 0 initialises the accumulators, any other value
;       adds to them; 0 and 1 advance %2/%3 by two rows afterwards, any
;       other value (callers pass 2 for the last pair) leaves them alone
;   %2  source pointer register (read with movdqa, so the rows must be
;       16-byte aligned)
;   %3  reference pointer register (read with lddqu, no alignment needed)
;   %4  source stride register
;   %5  reference stride register
; Accumulators: xmm5 (+0), xmm6 (+1), xmm7 (+2). Scratch: xmm0-xmm3.
;-----------------------------------------------------------------------
167 %macro PROCESS_16X2X3 5
168 %if %1==0
; first row, first call: results go straight into the accumulators
169 movdqa xmm0, XMMWORD PTR [%2]
170 lddqu xmm5, XMMWORD PTR [%3]
171 lddqu xmm6, XMMWORD PTR [%3+1]
172 lddqu xmm7, XMMWORD PTR [%3+2]
174 psadbw xmm5, xmm0
175 psadbw xmm6, xmm0
176 psadbw xmm7, xmm0
177 %else
; first row, later calls: compute in scratch registers, then accumulate
178 movdqa xmm0, XMMWORD PTR [%2]
179 lddqu xmm1, XMMWORD PTR [%3]
180 lddqu xmm2, XMMWORD PTR [%3+1]
181 lddqu xmm3, XMMWORD PTR [%3+2]
183 psadbw xmm1, xmm0
184 psadbw xmm2, xmm0
185 psadbw xmm3, xmm0
187 paddw xmm5, xmm1
188 paddw xmm6, xmm2
189 paddw xmm7, xmm3
190 %endif
; second row of the pair
191 movdqa xmm0, XMMWORD PTR [%2+%4]
192 lddqu xmm1, XMMWORD PTR [%3+%5]
193 lddqu xmm2, XMMWORD PTR [%3+%5+1]
194 lddqu xmm3, XMMWORD PTR [%3+%5+2]
196 %if %1==0 || %1==1
; step both pointers past the two rows just loaded
197 lea %2, [%2+%4*2]
198 lea %3, [%3+%5*2]
199 %endif
201 psadbw xmm1, xmm0
202 psadbw xmm2, xmm0
203 psadbw xmm3, xmm0
205 paddw xmm5, xmm1
206 paddw xmm6, xmm2
207 paddw xmm7, xmm3
208 %endmacro
;-----------------------------------------------------------------------
; PROCESS_8X2X3 stage, src, ref, src_stride, ref_stride
; MMX counterpart of PROCESS_16X2X3 for 8-byte-wide rows: handles two
; rows and accumulates the sums of absolute differences against the
; reference at byte offsets +0, +1 and +2.
;   %1  stage selector: 0 initialises the accumulators, any other value
;       adds to them; 0 and 1 advance %2/%3 by two rows, any other value
;       leaves the pointers unchanged
;   %2  source pointer register      %4  source stride register
;   %3  reference pointer register   %5  reference stride register
; Accumulators: mm5 (+0), mm6 (+1), mm7 (+2). Scratch: mm0-mm3.
;-----------------------------------------------------------------------
210 %macro PROCESS_8X2X3 5
211 %if %1==0
; first row, first call: results go straight into the accumulators
212 movq mm0, QWORD PTR [%2]
213 movq mm5, QWORD PTR [%3]
214 movq mm6, QWORD PTR [%3+1]
215 movq mm7, QWORD PTR [%3+2]
217 psadbw mm5, mm0
218 psadbw mm6, mm0
219 psadbw mm7, mm0
220 %else
; first row, later calls: compute in scratch registers, then accumulate
221 movq mm0, QWORD PTR [%2]
222 movq mm1, QWORD PTR [%3]
223 movq mm2, QWORD PTR [%3+1]
224 movq mm3, QWORD PTR [%3+2]
226 psadbw mm1, mm0
227 psadbw mm2, mm0
228 psadbw mm3, mm0
230 paddw mm5, mm1
231 paddw mm6, mm2
232 paddw mm7, mm3
233 %endif
; second row of the pair
234 movq mm0, QWORD PTR [%2+%4]
235 movq mm1, QWORD PTR [%3+%5]
236 movq mm2, QWORD PTR [%3+%5+1]
237 movq mm3, QWORD PTR [%3+%5+2]
239 %if %1==0 || %1==1
; step both pointers past the two rows just loaded
240 lea %2, [%2+%4*2]
241 lea %3, [%3+%5*2]
242 %endif
244 psadbw mm1, mm0
245 psadbw mm2, mm0
246 psadbw mm3, mm0
248 paddw mm5, mm1
249 paddw mm6, mm2
250 paddw mm7, mm3
251 %endmacro
;-----------------------------------------------------------------------
; LOAD_X4_ADDRESSES base, dst0, dst1, dst2, dst3
; Loads four consecutive pointer-sized values, REG_SZ_BYTES apart,
; starting at [%1] into %2..%5.
; %5 is written last, so it may be the same register as %1 (every use
; in this file relies on that); %2..%4 must differ from %1.
;-----------------------------------------------------------------------
253 %macro LOAD_X4_ADDRESSES 5
254 mov %2, [%1+REG_SZ_BYTES*0]
255 mov %3, [%1+REG_SZ_BYTES*1]
257 mov %4, [%1+REG_SZ_BYTES*2]
258 mov %5, [%1+REG_SZ_BYTES*3]
259 %endmacro
;-----------------------------------------------------------------------
; PROCESS_16X2X4 stage, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
; Handles two 16-byte rows and accumulates the sum of absolute
; differences of the source against four independent reference blocks.
;   %1      stage selector: 0 initialises the accumulators, any other
;           value adds to them; 0 and 1 advance all five pointers by two
;           rows, any other value leaves them unchanged
;   %2      source pointer register (read with movdqa, so the rows must
;           be 16-byte aligned)
;   %3-%6   the four reference pointer registers (read with lddqu)
;   %7      source stride register
;   %8      reference stride register, shared by all four references
; Accumulators: xmm4 (%3), xmm5 (%4), xmm6 (%5), xmm7 (%6).
; Scratch: xmm0-xmm3. The fourth reference reuses xmm1 once the first
; reference's partial sum has been added to xmm4.
;-----------------------------------------------------------------------
261 %macro PROCESS_16X2X4 8
262 %if %1==0
; first row, first call: results go straight into the accumulators
263 movdqa xmm0, XMMWORD PTR [%2]
264 lddqu xmm4, XMMWORD PTR [%3]
265 lddqu xmm5, XMMWORD PTR [%4]
266 lddqu xmm6, XMMWORD PTR [%5]
267 lddqu xmm7, XMMWORD PTR [%6]
269 psadbw xmm4, xmm0
270 psadbw xmm5, xmm0
271 psadbw xmm6, xmm0
272 psadbw xmm7, xmm0
273 %else
; first row, later calls
274 movdqa xmm0, XMMWORD PTR [%2]
275 lddqu xmm1, XMMWORD PTR [%3]
276 lddqu xmm2, XMMWORD PTR [%4]
277 lddqu xmm3, XMMWORD PTR [%5]
279 psadbw xmm1, xmm0
280 psadbw xmm2, xmm0
281 psadbw xmm3, xmm0
283 paddw xmm4, xmm1
; xmm1 is free again: reuse it for the fourth reference
284 lddqu xmm1, XMMWORD PTR [%6]
285 paddw xmm5, xmm2
286 paddw xmm6, xmm3
288 psadbw xmm1, xmm0
289 paddw xmm7, xmm1
290 %endif
; second row of the pair
291 movdqa xmm0, XMMWORD PTR [%2+%7]
292 lddqu xmm1, XMMWORD PTR [%3+%8]
293 lddqu xmm2, XMMWORD PTR [%4+%8]
294 lddqu xmm3, XMMWORD PTR [%5+%8]
296 psadbw xmm1, xmm0
297 psadbw xmm2, xmm0
298 psadbw xmm3, xmm0
300 paddw xmm4, xmm1
301 lddqu xmm1, XMMWORD PTR [%6+%8]
302 paddw xmm5, xmm2
303 paddw xmm6, xmm3
305 %if %1==0 || %1==1
; every load of this pair has been issued; step all pointers two rows
306 lea %2, [%2+%7*2]
307 lea %3, [%3+%8*2]
309 lea %4, [%4+%8*2]
310 lea %5, [%5+%8*2]
312 lea %6, [%6+%8*2]
313 %endif
314 psadbw xmm1, xmm0
315 paddw xmm7, xmm1
317 %endmacro
;-----------------------------------------------------------------------
; PROCESS_8X2X4 stage, src, ref0, ref1, ref2, ref3, src_stride, ref_stride
; MMX counterpart of PROCESS_16X2X4 for 8-byte-wide rows: handles two
; rows and accumulates the sum of absolute differences of the source
; against four independent reference blocks.
;   %1      stage selector: 0 initialises the accumulators, any other
;           value adds to them; 0 and 1 advance all five pointers by two
;           rows, any other value leaves them unchanged
;   %2      source pointer register
;   %3-%6   the four reference pointer registers
;   %7      source stride register
;   %8      reference stride register, shared by all four references
; Accumulators: mm4 (%3), mm5 (%4), mm6 (%5), mm7 (%6). Scratch: mm0-mm3.
;-----------------------------------------------------------------------
319 %macro PROCESS_8X2X4 8
320 %if %1==0
; first row, first call: results go straight into the accumulators
321 movq mm0, QWORD PTR [%2]
322 movq mm4, QWORD PTR [%3]
323 movq mm5, QWORD PTR [%4]
324 movq mm6, QWORD PTR [%5]
325 movq mm7, QWORD PTR [%6]
327 psadbw mm4, mm0
328 psadbw mm5, mm0
329 psadbw mm6, mm0
330 psadbw mm7, mm0
331 %else
; first row, later calls
332 movq mm0, QWORD PTR [%2]
333 movq mm1, QWORD PTR [%3]
334 movq mm2, QWORD PTR [%4]
335 movq mm3, QWORD PTR [%5]
337 psadbw mm1, mm0
338 psadbw mm2, mm0
339 psadbw mm3, mm0
341 paddw mm4, mm1
; mm1 is free again: reuse it for the fourth reference
342 movq mm1, QWORD PTR [%6]
343 paddw mm5, mm2
344 paddw mm6, mm3
346 psadbw mm1, mm0
347 paddw mm7, mm1
348 %endif
; second row of the pair
349 movq mm0, QWORD PTR [%2+%7]
350 movq mm1, QWORD PTR [%3+%8]
351 movq mm2, QWORD PTR [%4+%8]
352 movq mm3, QWORD PTR [%5+%8]
354 psadbw mm1, mm0
355 psadbw mm2, mm0
356 psadbw mm3, mm0
358 paddw mm4, mm1
359 movq mm1, QWORD PTR [%6+%8]
360 paddw mm5, mm2
361 paddw mm6, mm3
363 %if %1==0 || %1==1
; every load of this pair has been issued; step all pointers two rows
364 lea %2, [%2+%7*2]
365 lea %3, [%3+%8*2]
367 lea %4, [%4+%8*2]
368 lea %5, [%5+%8*2]
370 lea %6, [%6+%8*2]
371 %endif
372 psadbw mm1, mm0
373 paddw mm7, mm1
375 %endmacro
;-----------------------------------------------------------------------
; Sum of absolute differences of a 16x16 source block against three
; reference blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2.
; The three sums are stored as 32-bit values at results[0..2].
; Eight PROCESS_16X2X3 invocations cover the 16 rows; the source rows
; are read with aligned loads.
;-----------------------------------------------------------------------
377 ;void vp8_sad16x16x3_sse3(
378 ; unsigned char *src_ptr,
379 ; int src_stride,
380 ; unsigned char *ref_ptr,
381 ; int ref_stride,
382 ; int *results)
383 global sym(vp8_sad16x16x3_sse3) PRIVATE
384 sym(vp8_sad16x16x3_sse3):
386 STACK_FRAME_CREATE_X3
388 PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
389 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
390 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
391 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
392 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
393 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
394 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
395 PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
; rcx may be one of the argument aliases; none of them is read below
397 mov rcx, result_ptr
; each accumulator holds one partial sum per 64-bit half: add the two
; halves and store the low dword
399 movq xmm0, xmm5
400 psrldq xmm5, 8
402 paddw xmm0, xmm5
403 movd [rcx], xmm0
404 ;-
405 movq xmm0, xmm6
406 psrldq xmm6, 8
408 paddw xmm0, xmm6
409 movd [rcx+4], xmm0
410 ;-
411 movq xmm0, xmm7
412 psrldq xmm7, 8
414 paddw xmm0, xmm7
415 movd [rcx+8], xmm0
417 STACK_FRAME_DESTROY_X3
;-----------------------------------------------------------------------
; Sum of absolute differences of a 16-wide, 8-row source block against
; three reference blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2.
; The three sums are stored as 32-bit values at results[0..2].
; Four PROCESS_16X2X3 invocations cover the 8 rows; the source rows are
; read with aligned loads.
;-----------------------------------------------------------------------
419 ;void vp8_sad16x8x3_sse3(
420 ; unsigned char *src_ptr,
421 ; int src_stride,
422 ; unsigned char *ref_ptr,
423 ; int ref_stride,
424 ; int *results)
425 global sym(vp8_sad16x8x3_sse3) PRIVATE
426 sym(vp8_sad16x8x3_sse3):
428 STACK_FRAME_CREATE_X3
430 PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
431 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
432 PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
433 PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
; rcx may be one of the argument aliases; none of them is read below
435 mov rcx, result_ptr
; each accumulator holds one partial sum per 64-bit half: add the two
; halves and store the low dword
437 movq xmm0, xmm5
438 psrldq xmm5, 8
440 paddw xmm0, xmm5
441 movd [rcx], xmm0
442 ;-
443 movq xmm0, xmm6
444 psrldq xmm6, 8
446 paddw xmm0, xmm6
447 movd [rcx+4], xmm0
448 ;-
449 movq xmm0, xmm7
450 psrldq xmm7, 8
452 paddw xmm0, xmm7
453 movd [rcx+8], xmm0
455 STACK_FRAME_DESTROY_X3
;-----------------------------------------------------------------------
; Sum of absolute differences of an 8-wide, 16-row source block against
; three reference blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2.
; The three sums are stored as 32-bit values at results[0..2].
; Uses the MMX registers mm0-mm7 and does not execute emms.
; NOTE(review): presumably the caller resets the MMX/x87 state -- confirm.
;-----------------------------------------------------------------------
457 ;void vp8_sad8x16x3_sse3(
458 ; unsigned char *src_ptr,
459 ; int src_stride,
460 ; unsigned char *ref_ptr,
461 ; int ref_stride,
462 ; int *results)
463 global sym(vp8_sad8x16x3_sse3) PRIVATE
464 sym(vp8_sad8x16x3_sse3):
466 STACK_FRAME_CREATE_X3
468 PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
469 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
470 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
471 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
472 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
473 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
474 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
475 PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
; rcx may be one of the argument aliases; none of them is read below
477 mov rcx, result_ptr
; pack the +0 and +1 sums into one qword so they go out in one store
479 punpckldq mm5, mm6
481 movq [rcx], mm5
482 movd [rcx+8], mm7
484 STACK_FRAME_DESTROY_X3
;-----------------------------------------------------------------------
; Sum of absolute differences of an 8x8 source block against three
; reference blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2.
; The three sums are stored as 32-bit values at results[0..2].
; Uses the MMX registers mm0-mm7 and does not execute emms.
; NOTE(review): presumably the caller resets the MMX/x87 state -- confirm.
;-----------------------------------------------------------------------
486 ;void vp8_sad8x8x3_sse3(
487 ; unsigned char *src_ptr,
488 ; int src_stride,
489 ; unsigned char *ref_ptr,
490 ; int ref_stride,
491 ; int *results)
492 global sym(vp8_sad8x8x3_sse3) PRIVATE
493 sym(vp8_sad8x8x3_sse3):
495 STACK_FRAME_CREATE_X3
497 PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
498 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
499 PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
500 PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
; rcx may be one of the argument aliases; none of them is read below
502 mov rcx, result_ptr
; pack the +0 and +1 sums into one qword so they go out in one store
504 punpckldq mm5, mm6
506 movq [rcx], mm5
507 movd [rcx+8], mm7
509 STACK_FRAME_DESTROY_X3
;-----------------------------------------------------------------------
; Sum of absolute differences of a 4x4 source block against three
; reference blocks starting at ref_ptr, ref_ptr+1 and ref_ptr+2.
; The three sums are stored as 32-bit values at results[0..2].
; Two 4-byte rows are interleaved into one 8-byte MMX register with
; punpcklbw (source and reference identically), so a single psadbw
; covers two rows at once.
; Uses the MMX registers mm0-mm7 and does not execute emms.
; NOTE(review): presumably the caller resets the MMX/x87 state -- confirm.
;-----------------------------------------------------------------------
511 ;void vp8_sad4x4x3_sse3(
512 ; unsigned char *src_ptr,
513 ; int src_stride,
514 ; unsigned char *ref_ptr,
515 ; int ref_stride,
516 ; int *results)
517 global sym(vp8_sad4x4x3_sse3) PRIVATE
518 sym(vp8_sad4x4x3_sse3):
520 STACK_FRAME_CREATE_X3
; rows 0-1: mm0 = source, mm1 = reference at offset +0
522 movd mm0, DWORD PTR [src_ptr]
523 movd mm1, DWORD PTR [ref_ptr]
525 movd mm2, DWORD PTR [src_ptr+src_stride]
526 movd mm3, DWORD PTR [ref_ptr+ref_stride]
528 punpcklbw mm0, mm2
529 punpcklbw mm1, mm3
; rows 0-1: mm4 = reference at offset +1, mm5 = reference at offset +2
531 movd mm4, DWORD PTR [ref_ptr+1]
532 movd mm5, DWORD PTR [ref_ptr+2]
534 movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
535 movd mm3, DWORD PTR [ref_ptr+ref_stride+2]
537 psadbw mm1, mm0
539 punpcklbw mm4, mm2
540 punpcklbw mm5, mm3
542 psadbw mm4, mm0
543 psadbw mm5, mm0
545 lea src_ptr, [src_ptr+src_stride*2]
546 lea ref_ptr, [ref_ptr+ref_stride*2]
; rows 2-3: mm0 = source, mm2 = reference at offset +0
548 movd mm0, DWORD PTR [src_ptr]
549 movd mm2, DWORD PTR [ref_ptr]
551 movd mm3, DWORD PTR [src_ptr+src_stride]
552 movd mm6, DWORD PTR [ref_ptr+ref_stride]
554 punpcklbw mm0, mm3
555 punpcklbw mm2, mm6
; rows 2-3: mm3 = reference at offset +1, mm7 = reference at offset +2
557 movd mm3, DWORD PTR [ref_ptr+1]
558 movd mm7, DWORD PTR [ref_ptr+2]
560 psadbw mm2, mm0
; mm1 = total for offset +0
562 paddw mm1, mm2
564 movd mm2, DWORD PTR [ref_ptr+ref_stride+1]
565 movd mm6, DWORD PTR [ref_ptr+ref_stride+2]
567 punpcklbw mm3, mm2
568 punpcklbw mm7, mm6
570 psadbw mm3, mm0
571 psadbw mm7, mm0
; mm3 = total for offset +1, mm7 = total for offset +2
573 paddw mm3, mm4
574 paddw mm7, mm5
; rcx may be one of the argument aliases; none of them is read below
576 mov rcx, result_ptr
; pack the +0 and +1 totals into one qword so they go out in one store
578 punpckldq mm1, mm3
580 movq [rcx], mm1
581 movd [rcx+8], mm7
583 STACK_FRAME_DESTROY_X3
;-----------------------------------------------------------------------
; Sum of absolute differences of one 16x16 source block against one
; 16x16 reference block; the sum is returned in rax.
; The source rows are read with movdqa (16-byte alignment required),
; the reference rows with movdqu.
; The max_sad argument is never read by this routine.
;-----------------------------------------------------------------------
585 ;unsigned int vp8_sad16x16_sse3(
586 ; unsigned char *src_ptr,
587 ; int src_stride,
588 ; unsigned char *ref_ptr,
589 ; int ref_stride,
590 ; int max_sad)
591 ;%define lddqu movdqu
592 global sym(vp8_sad16x16_sse3) PRIVATE
593 sym(vp8_sad16x16_sse3):
595 STACK_FRAME_CREATE_X3
; end_ptr serves as the loop counter: 4 iterations of 4 rows each
597 mov end_ptr, 4
; xmm7 accumulates the partial sums
598 pxor xmm7, xmm7
600 .vp8_sad16x16_sse3_loop:
; rows 0 and 1 of this group
601 movdqa xmm0, XMMWORD PTR [src_ptr]
602 movdqu xmm1, XMMWORD PTR [ref_ptr]
603 movdqa xmm2, XMMWORD PTR [src_ptr+src_stride]
604 movdqu xmm3, XMMWORD PTR [ref_ptr+ref_stride]
606 lea src_ptr, [src_ptr+src_stride*2]
607 lea ref_ptr, [ref_ptr+ref_stride*2]
; rows 2 and 3 of this group
609 movdqa xmm4, XMMWORD PTR [src_ptr]
610 movdqu xmm5, XMMWORD PTR [ref_ptr]
611 movdqa xmm6, XMMWORD PTR [src_ptr+src_stride]
613 psadbw xmm0, xmm1
; xmm1 has been consumed above and is reused for reference row 3
615 movdqu xmm1, XMMWORD PTR [ref_ptr+ref_stride]
617 psadbw xmm2, xmm3
618 psadbw xmm4, xmm5
619 psadbw xmm6, xmm1
621 lea src_ptr, [src_ptr+src_stride*2]
622 lea ref_ptr, [ref_ptr+ref_stride*2]
624 paddw xmm7, xmm0
625 paddw xmm7, xmm2
626 paddw xmm7, xmm4
627 paddw xmm7, xmm6
629 sub end_ptr, 1
630 jne .vp8_sad16x16_sse3_loop
; add the partial sums of the two 64-bit halves; the result goes to rax
632 movq xmm0, xmm7
633 psrldq xmm7, 8
634 paddw xmm0, xmm7
635 movq rax, xmm0
637 STACK_FRAME_DESTROY_X3
;-----------------------------------------------------------------------
; Copies a block 32 bytes wide and `height` rows tall from src_ptr to
; dst_ptr. The destination uses the ref_ptr / ref_stride aliases set up
; by STACK_FRAME_CREATE_X3 (third and fourth argument).
; Source rows are read with movdqu; destination rows are written with
; movdqa, so every destination row must be 16-byte aligned.
; Rows are copied four at a time, then one at a time for the remainder.
; NOTE(review): the four-row pass runs once before height is tested, so
; this presumably expects height >= 4 -- confirm against the callers.
;-----------------------------------------------------------------------
639 ;void vp8_copy32xn_sse3(
640 ; unsigned char *src_ptr,
641 ; int src_stride,
642 ; unsigned char *dst_ptr,
643 ; int dst_stride,
644 ; int height);
645 global sym(vp8_copy32xn_sse3) PRIVATE
646 sym(vp8_copy32xn_sse3):
648 STACK_FRAME_CREATE_X3
650 .block_copy_sse3_loopx4:
; end_ptr = address of source row 2
651 lea end_ptr, [src_ptr+src_stride*2]
; load four source rows (2 x 16 bytes each) into xmm0-xmm7
653 movdqu xmm0, XMMWORD PTR [src_ptr]
654 movdqu xmm1, XMMWORD PTR [src_ptr + 16]
655 movdqu xmm2, XMMWORD PTR [src_ptr + src_stride]
656 movdqu xmm3, XMMWORD PTR [src_ptr + src_stride + 16]
657 movdqu xmm4, XMMWORD PTR [end_ptr]
658 movdqu xmm5, XMMWORD PTR [end_ptr + 16]
659 movdqu xmm6, XMMWORD PTR [end_ptr + src_stride]
660 movdqu xmm7, XMMWORD PTR [end_ptr + src_stride + 16]
662 lea src_ptr, [src_ptr+src_stride*4]
; end_ptr = address of destination row 2
664 lea end_ptr, [ref_ptr+ref_stride*2]
666 movdqa XMMWORD PTR [ref_ptr], xmm0
667 movdqa XMMWORD PTR [ref_ptr + 16], xmm1
668 movdqa XMMWORD PTR [ref_ptr + ref_stride], xmm2
669 movdqa XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
670 movdqa XMMWORD PTR [end_ptr], xmm4
671 movdqa XMMWORD PTR [end_ptr + 16], xmm5
672 movdqa XMMWORD PTR [end_ptr + ref_stride], xmm6
673 movdqa XMMWORD PTR [end_ptr + ref_stride + 16], xmm7
675 lea ref_ptr, [ref_ptr+ref_stride*4]
; keep going while at least four rows remain (signed comparison)
677 sub height, 4
678 cmp height, 4
679 jge .block_copy_sse3_loopx4
681 ;Check whether any rows remain to be copied.
682 cmp height, 0
683 je .copy_is_done
; copy the remaining rows one at a time
685 .block_copy_sse3_loop:
686 movdqu xmm0, XMMWORD PTR [src_ptr]
687 movdqu xmm1, XMMWORD PTR [src_ptr + 16]
688 lea src_ptr, [src_ptr+src_stride]
690 movdqa XMMWORD PTR [ref_ptr], xmm0
691 movdqa XMMWORD PTR [ref_ptr + 16], xmm1
692 lea ref_ptr, [ref_ptr+ref_stride]
694 sub height, 1
695 jne .block_copy_sse3_loop
697 .copy_is_done:
698 STACK_FRAME_DESTROY_X3
;-----------------------------------------------------------------------
; Sum of absolute differences of one 16x16 source block against four
; reference blocks. ref_ptr_base is read as the address of four
; consecutive reference pointers, all sharing ref_stride.
; The four sums are stored as 32-bit values at results[0..3].
; The source rows are read with aligned loads.
;-----------------------------------------------------------------------
700 ;void vp8_sad16x16x4d_sse3(
701 ; unsigned char *src_ptr,
702 ; int src_stride,
703 ; unsigned char *ref_ptr_base,
704 ; int ref_stride,
705 ; int *results)
706 global sym(vp8_sad16x16x4d_sse3) PRIVATE
707 sym(vp8_sad16x16x4d_sse3):
709 STACK_FRAME_CREATE_X4
711 PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
712 PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
713 PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
714 PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
715 PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
716 PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
717 PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
718 PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
720 %if ABI_IS_32BIT
; rbp held ref_stride; pop the copy saved by the prologue before
; result_ptr (arg(4)) is read
721 pop rbp
722 %endif
; rcx may be one of the prologue's aliases; none of them is read below
723 mov rcx, result_ptr
; each accumulator holds one partial sum per 64-bit half: add the two
; halves and store the low dword
725 movq xmm0, xmm4
726 psrldq xmm4, 8
728 paddw xmm0, xmm4
729 movd [rcx], xmm0
730 ;-
731 movq xmm0, xmm5
732 psrldq xmm5, 8
734 paddw xmm0, xmm5
735 movd [rcx+4], xmm0
736 ;-
737 movq xmm0, xmm6
738 psrldq xmm6, 8
740 paddw xmm0, xmm6
741 movd [rcx+8], xmm0
742 ;-
743 movq xmm0, xmm7
744 psrldq xmm7, 8
746 paddw xmm0, xmm7
747 movd [rcx+12], xmm0
749 STACK_FRAME_DESTROY_X4
;-----------------------------------------------------------------------
; Sum of absolute differences of one 16-wide, 8-row source block against
; four reference blocks. ref_ptr_base is read as the address of four
; consecutive reference pointers, all sharing ref_stride.
; The four sums are stored as 32-bit values at results[0..3].
; The source rows are read with aligned loads.
;-----------------------------------------------------------------------
751 ;void vp8_sad16x8x4d_sse3(
752 ; unsigned char *src_ptr,
753 ; int src_stride,
754 ; unsigned char *ref_ptr_base,
755 ; int ref_stride,
756 ; int *results)
757 global sym(vp8_sad16x8x4d_sse3) PRIVATE
758 sym(vp8_sad16x8x4d_sse3):
760 STACK_FRAME_CREATE_X4
762 PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
763 PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
764 PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
765 PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
767 %if ABI_IS_32BIT
; rbp held ref_stride; pop the copy saved by the prologue before
; result_ptr (arg(4)) is read
768 pop rbp
769 %endif
; rcx may be one of the prologue's aliases; none of them is read below
770 mov rcx, result_ptr
; each accumulator holds one partial sum per 64-bit half: add the two
; halves and store the low dword
772 movq xmm0, xmm4
773 psrldq xmm4, 8
775 paddw xmm0, xmm4
776 movd [rcx], xmm0
777 ;-
778 movq xmm0, xmm5
779 psrldq xmm5, 8
781 paddw xmm0, xmm5
782 movd [rcx+4], xmm0
783 ;-
784 movq xmm0, xmm6
785 psrldq xmm6, 8
787 paddw xmm0, xmm6
788 movd [rcx+8], xmm0
789 ;-
790 movq xmm0, xmm7
791 psrldq xmm7, 8
793 paddw xmm0, xmm7
794 movd [rcx+12], xmm0
796 STACK_FRAME_DESTROY_X4
;-----------------------------------------------------------------------
; Sum of absolute differences of one 8-wide, 16-row source block against
; four reference blocks. The third argument is read as the address of
; four consecutive reference pointers, all sharing ref_stride.
; The four sums are stored as 32-bit values at results[0..3].
; Uses the MMX registers mm0-mm7 and does not execute emms.
; NOTE(review): presumably the caller resets the MMX/x87 state -- confirm.
;-----------------------------------------------------------------------
798 ;void vp8_sad8x16x4d_sse3(
799 ; unsigned char *src_ptr,
800 ; int src_stride,
801 ; unsigned char *ref_ptr_base,
802 ; int ref_stride,
803 ; int *results)
804 global sym(vp8_sad8x16x4d_sse3) PRIVATE
805 sym(vp8_sad8x16x4d_sse3):
807 STACK_FRAME_CREATE_X4
809 PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
810 PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
811 PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
812 PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
813 PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
814 PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
815 PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
816 PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
818 %if ABI_IS_32BIT
; rbp held ref_stride; pop the copy saved by the prologue before
; result_ptr (arg(4)) is read
819 pop rbp
820 %endif
; rcx may be one of the prologue's aliases; none of them is read below
821 mov rcx, result_ptr
; pack the sums pairwise so that two qword stores write all four
823 punpckldq mm4, mm5
824 punpckldq mm6, mm7
826 movq [rcx], mm4
827 movq [rcx+8], mm6
829 STACK_FRAME_DESTROY_X4
;-----------------------------------------------------------------------
; Sum of absolute differences of one 8x8 source block against four
; reference blocks. The third argument is read as the address of four
; consecutive reference pointers, all sharing ref_stride.
; The four sums are stored as 32-bit values at results[0..3].
; Uses the MMX registers mm0-mm7 and does not execute emms.
; NOTE(review): presumably the caller resets the MMX/x87 state -- confirm.
;-----------------------------------------------------------------------
831 ;void vp8_sad8x8x4d_sse3(
832 ; unsigned char *src_ptr,
833 ; int src_stride,
834 ; unsigned char *ref_ptr_base,
835 ; int ref_stride,
836 ; int *results)
837 global sym(vp8_sad8x8x4d_sse3) PRIVATE
838 sym(vp8_sad8x8x4d_sse3):
840 STACK_FRAME_CREATE_X4
842 PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
843 PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
844 PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
845 PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
847 %if ABI_IS_32BIT
; rbp held ref_stride; pop the copy saved by the prologue before
; result_ptr (arg(4)) is read
848 pop rbp
849 %endif
; rcx may be one of the prologue's aliases; none of them is read below
850 mov rcx, result_ptr
; pack the sums pairwise so that two qword stores write all four
852 punpckldq mm4, mm5
853 punpckldq mm6, mm7
855 movq [rcx], mm4
856 movq [rcx+8], mm6
858 STACK_FRAME_DESTROY_X4
;-----------------------------------------------------------------------
; Sum of absolute differences of one 4x4 source block against four
; reference blocks. The third argument is read as the address of four
; consecutive reference pointers, all sharing ref_stride.
; The four sums are stored as 32-bit values at results[0..3].
; Two 4-byte rows are interleaved into one 8-byte MMX register with
; punpcklbw (source and reference identically), so a single psadbw
; covers two rows at once.
; Uses the MMX registers mm0-mm7 and does not execute emms.
; NOTE(review): presumably the caller resets the MMX/x87 state -- confirm.
;-----------------------------------------------------------------------
860 ;void vp8_sad4x4x4d_sse3(
861 ; unsigned char *src_ptr,
862 ; int src_stride,
863 ; unsigned char *ref_ptr_base,
864 ; int ref_stride,
865 ; int *results)
866 global sym(vp8_sad4x4x4d_sse3) PRIVATE
867 sym(vp8_sad4x4x4d_sse3):
869 STACK_FRAME_CREATE_X4
; rows 0-1: mm0 = source, mm1 = reference 0
871 movd mm0, DWORD PTR [src_ptr]
872 movd mm1, DWORD PTR [r0_ptr]
874 movd mm2, DWORD PTR [src_ptr+src_stride]
875 movd mm3, DWORD PTR [r0_ptr+ref_stride]
877 punpcklbw mm0, mm2
878 punpcklbw mm1, mm3
; rows 0-1: mm4 = reference 1, mm5 = reference 2, mm6 = reference 3
880 movd mm4, DWORD PTR [r1_ptr]
881 movd mm5, DWORD PTR [r2_ptr]
883 movd mm6, DWORD PTR [r3_ptr]
884 movd mm2, DWORD PTR [r1_ptr+ref_stride]
886 movd mm3, DWORD PTR [r2_ptr+ref_stride]
887 movd mm7, DWORD PTR [r3_ptr+ref_stride]
889 psadbw mm1, mm0
891 punpcklbw mm4, mm2
892 punpcklbw mm5, mm3
894 punpcklbw mm6, mm7
895 psadbw mm4, mm0
897 psadbw mm5, mm0
898 psadbw mm6, mm0
; advance every pointer to rows 2-3
902 lea src_ptr, [src_ptr+src_stride*2]
903 lea r0_ptr, [r0_ptr+ref_stride*2]
905 lea r1_ptr, [r1_ptr+ref_stride*2]
906 lea r2_ptr, [r2_ptr+ref_stride*2]
908 lea r3_ptr, [r3_ptr+ref_stride*2]
; rows 2-3: mm0 = source, mm2 = reference 0
910 movd mm0, DWORD PTR [src_ptr]
911 movd mm2, DWORD PTR [r0_ptr]
913 movd mm3, DWORD PTR [src_ptr+src_stride]
914 movd mm7, DWORD PTR [r0_ptr+ref_stride]
916 punpcklbw mm0, mm3
917 punpcklbw mm2, mm7
; first row of references 1 and 2 for rows 2-3
919 movd mm3, DWORD PTR [r1_ptr]
920 movd mm7, DWORD PTR [r2_ptr]
922 psadbw mm2, mm0
923 %if ABI_IS_32BIT
; src_stride (rax) is not read any more: move ref_stride from rbp to
; rax and pop the saved rbp so that result_ptr (arg(4)) can be read
924 mov rax, rbp
926 pop rbp
927 %define ref_stride rax
928 %endif
; rsi is src_ptr, src_stride or r0_ptr depending on the ABI; none of
; them is read after this point
929 mov rsi, result_ptr
; reference 0 is complete: store its total
931 paddw mm1, mm2
932 movd [rsi], mm1
; second row of references 1 and 2 (mm1 is free again)
934 movd mm2, DWORD PTR [r1_ptr+ref_stride]
935 movd mm1, DWORD PTR [r2_ptr+ref_stride]
937 punpcklbw mm3, mm2
938 punpcklbw mm7, mm1
940 psadbw mm3, mm0
941 psadbw mm7, mm0
; rows 2-3 of reference 3
943 movd mm2, DWORD PTR [r3_ptr]
944 movd mm1, DWORD PTR [r3_ptr+ref_stride]
; add the rows 0-1 partial sums of references 1 and 2
946 paddw mm3, mm4
947 paddw mm7, mm5
949 movd [rsi+4], mm3
950 punpcklbw mm2, mm1
952 movd [rsi+8], mm7
953 psadbw mm2, mm0
; add the rows 0-1 partial sum of reference 3
955 paddw mm2, mm6
956 movd [rsi+12], mm2
959 STACK_FRAME_DESTROY_X4