Thu, 15 Jan 2015 15:59:08 +0100
Implement a real Private Browsing Mode condition by changing the API/ABI.
This solves Tor bug #9701 and complies with the disk-avoidance requirements documented at
https://www.torproject.org/projects/torbrowser/design/#disk-avoidance.
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
12 %include "vpx_ports/x86_abi_support.asm"
; Right-shift applied after each bilinear filter pass. Every tap pair in
; vp8_bilinear_filters_sse2 sums to 128 (1 << 7), and xmm_bi_rd holds the
; matching rounding constant 64 that is added before the shift.
14 %define xmm_filter_shift 7
; Sum of squares of 256 consecutive 16-bit values.
;   In:   arg(0) = src_ptr. It is read with aligned 16-byte loads (movdqa),
;         so it must be 16-byte aligned. 8 iterations x 64 bytes = 256 shorts.
;   Out:  low 32 bits of rax = sum of src_ptr[i] * src_ptr[i] for i = 0..255.
;         The upper half of rax is a leftover partial sum and is not part of
;         the unsigned int result.
;   Uses: rax, rcx, xmm0-xmm4, flags.
16 ;unsigned int vp8_get_mb_ss_sse2
17 ;(
18 ; short *src_ptr
19 ;)
20 global sym(vp8_get_mb_ss_sse2) PRIVATE
21 sym(vp8_get_mb_ss_sse2):
22 push rbp
23 mov rbp, rsp
24 SHADOW_ARGS_TO_STACK 1
25 GET_GOT rbx
26 push rsi
27 push rdi
; The 16 bytes reserved here are not referenced anywhere in this routine.
28 sub rsp, 16
29 ; end prolog
32 mov rax, arg(0) ;[src_ptr]
33 mov rcx, 8 ; loop counter: 8 chunks of 32 shorts
34 pxor xmm4, xmm4 ; xmm4 = four 32-bit partial sums of squares
36 .NEXTROW:
37 movdqa xmm0, [rax]
38 movdqa xmm1, [rax+16]
39 movdqa xmm2, [rax+32]
40 movdqa xmm3, [rax+48]
; pmaddwd x, x squares each word and adds adjacent pairs into dwords.
41 pmaddwd xmm0, xmm0
42 pmaddwd xmm1, xmm1
43 pmaddwd xmm2, xmm2
44 pmaddwd xmm3, xmm3
46 paddd xmm0, xmm1
47 paddd xmm2, xmm3
48 paddd xmm4, xmm0
49 paddd xmm4, xmm2
51 add rax, 0x40 ; advance 64 bytes (32 shorts)
52 dec rcx
; dec updates ZF but leaves CF untouched, so branch on ZF alone instead of
; "ja", which would also depend on the carry produced by the add above.
53 jnz .NEXTROW
; Horizontal add: fold the four dwords of xmm4 into its low dword.
55 movdqa xmm3,xmm4
56 psrldq xmm4,8
57 paddd xmm4,xmm3
58 movdqa xmm3,xmm4
59 psrldq xmm4,4
60 paddd xmm4,xmm3
61 movq rax,xmm4 ; return value (low dword is the total)
64 ; begin epilog
65 add rsp, 16
66 pop rdi
67 pop rsi
68 RESTORE_GOT
69 UNSHADOW_ARGS
70 pop rbp
71 ret
; Difference statistics for a block 16 pixels wide and 16 rows high.
;   In:   arg(0) = src_ptr, arg(1) = source_stride (sign-extended int),
;         arg(2) = ref_ptr, arg(3) = recon_stride (sign-extended int),
;         arg(4) = SSE (output pointer), arg(5) = Sum (output pointer).
;         Pixel rows are read with unaligned loads (movdqu).
;   Out:  *Sum = sum over all pixels of (src - ref), as a signed 32-bit value
;         *SSE = sum over all pixels of (src - ref)^2
;   Uses: rax, rcx, rdx, xmm0-xmm7 (rbx, rsi, rdi are saved and restored).
; NOTE(review): rax is not loaded with a result; on exit it holds the Sum
; pointer. Confirm callers ignore the "unsigned int" return value.
74 ;unsigned int vp8_get16x16var_sse2
75 ;(
76 ; unsigned char * src_ptr,
77 ; int source_stride,
78 ; unsigned char * ref_ptr,
79 ; int recon_stride,
80 ; unsigned int * SSE,
81 ; int * Sum
82 ;)
83 global sym(vp8_get16x16var_sse2) PRIVATE
84 sym(vp8_get16x16var_sse2):
85 push rbp
86 mov rbp, rsp
87 SHADOW_ARGS_TO_STACK 6
88 SAVE_XMM 7
89 push rbx
90 push rsi
91 push rdi
92 ; end prolog
94 mov rsi, arg(0) ;[src_ptr]
95 mov rdi, arg(2) ;[ref_ptr]
97 movsxd rax, DWORD PTR arg(1) ;[source_stride]
98 movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
100 ; Prefetch data
; Rows 0..7 of the source block (rcx = 3 * stride, rbx = row 4).
101 lea rcx, [rax+rax*2]
102 prefetcht0 [rsi]
103 prefetcht0 [rsi+rax]
104 prefetcht0 [rsi+rax*2]
105 prefetcht0 [rsi+rcx]
106 lea rbx, [rsi+rax*4]
107 prefetcht0 [rbx]
108 prefetcht0 [rbx+rax]
109 prefetcht0 [rbx+rax*2]
110 prefetcht0 [rbx+rcx]
; Rows 0..7 of the reference block, same pattern.
112 lea rcx, [rdx+rdx*2]
113 prefetcht0 [rdi]
114 prefetcht0 [rdi+rdx]
115 prefetcht0 [rdi+rdx*2]
116 prefetcht0 [rdi+rcx]
117 lea rbx, [rdi+rdx*4]
118 prefetcht0 [rbx]
119 prefetcht0 [rbx+rdx]
120 prefetcht0 [rbx+rdx*2]
121 prefetcht0 [rbx+rcx]
123 pxor xmm0, xmm0 ; clear xmm0 for unpack
124 pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
126 pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
127 mov rcx, 16 ; row counter
129 .var16loop:
130 movdqu xmm1, XMMWORD PTR [rsi]
131 movdqu xmm2, XMMWORD PTR [rdi]
; Prefetch the rows eight strides ahead of the current ones.
133 prefetcht0 [rsi+rax*8]
134 prefetcht0 [rdi+rdx*8]
136 movdqa xmm3, xmm1
137 movdqa xmm4, xmm2
; Widen bytes to words: xmm1/xmm2 = pixels 0..7, xmm3/xmm4 = pixels 8..15.
140 punpcklbw xmm1, xmm0
141 punpckhbw xmm3, xmm0
143 punpcklbw xmm2, xmm0
144 punpckhbw xmm4, xmm0
147 psubw xmm1, xmm2 ; src - ref, low half
148 psubw xmm3, xmm4 ; src - ref, high half
; Sum is kept as eight 16-bit lanes; squares are pair-summed into dwords.
150 paddw xmm7, xmm1
151 pmaddwd xmm1, xmm1
153 paddw xmm7, xmm3
154 pmaddwd xmm3, xmm3
156 paddd xmm6, xmm1
157 paddd xmm6, xmm3
159 add rsi, rax
160 add rdi, rdx
162 sub rcx, 1
163 jnz .var16loop
; Reduction. xmm1 takes over the SSE dwords so xmm6/xmm5 can be reused to
; sign-extend the word sums: each word is placed in the upper half of a dword
; and shifted back down arithmetically.
166 movdqa xmm1, xmm6
167 pxor xmm6, xmm6
169 pxor xmm5, xmm5
170 punpcklwd xmm6, xmm7
172 punpckhwd xmm5, xmm7
173 psrad xmm5, 16
175 psrad xmm6, 16
176 paddd xmm6, xmm5 ; xmm6 = four dword partial sums
; Interleave with zero (xmm0) and add, first folding 4 dwords to 2 ...
178 movdqa xmm2, xmm1
179 punpckldq xmm1, xmm0
181 punpckhdq xmm2, xmm0
182 movdqa xmm7, xmm6
184 paddd xmm1, xmm2
185 punpckldq xmm6, xmm0
187 punpckhdq xmm7, xmm0
188 paddd xmm6, xmm7
; ... then folding the upper quadword onto the lower one.
190 movdqa xmm2, xmm1
191 movdqa xmm7, xmm6
193 psrldq xmm1, 8
194 psrldq xmm6, 8
196 paddd xmm7, xmm6 ; low dword of xmm7 = Sum
197 paddd xmm1, xmm2 ; low dword of xmm1 = SSE
199 mov rax, arg(5) ;[Sum]
200 mov rdi, arg(4) ;[SSE]
202 movd DWORD PTR [rax], xmm7
203 movd DWORD PTR [rdi], xmm1
206 ; begin epilog
207 pop rdi
208 pop rsi
209 pop rbx
210 RESTORE_XMM
211 UNSHADOW_ARGS
212 pop rbp
213 ret
; Difference statistics for a block 8 pixels wide and 8 rows high; the eight
; rows are fully unrolled.
;   In:   arg(0) = src_ptr, arg(1) = source_stride (sign-extended int),
;         arg(2) = ref_ptr, arg(3) = recon_stride (sign-extended int),
;         arg(4) = SSE (output pointer), arg(5) = Sum (output pointer).
;   Out:  *Sum = sum of (src - ref), reduced in 16-bit arithmetic and then
;                sign-extended to 32 bits
;         *SSE = sum of (src - ref)^2
;   Uses: rax, rcx, rdx, xmm0-xmm3, xmm6, xmm7 (rsi, rdi saved and restored).
; NOTE(review): rax is not loaded with a result; on exit it holds the Sum
; pointer. Confirm callers ignore the "unsigned int" return value.
218 ;unsigned int vp8_get8x8var_sse2
219 ;(
220 ; unsigned char * src_ptr,
221 ; int source_stride,
222 ; unsigned char * ref_ptr,
223 ; int recon_stride,
224 ; unsigned int * SSE,
225 ; int * Sum
226 ;)
227 global sym(vp8_get8x8var_sse2) PRIVATE
228 sym(vp8_get8x8var_sse2):
229 push rbp
230 mov rbp, rsp
231 SHADOW_ARGS_TO_STACK 6
232 SAVE_XMM 7
233 GET_GOT rbx
234 push rsi
235 push rdi
; The 16 bytes reserved here are not referenced anywhere in this routine.
236 sub rsp, 16
237 ; end prolog
239 mov rsi, arg(0) ;[src_ptr]
240 mov rdi, arg(2) ;[ref_ptr]
242 movsxd rax, DWORD PTR arg(1) ;[source_stride]
243 movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
245 pxor xmm0, xmm0 ; clear xmm0 for unpack
246 pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
; Row 0. xmm1 becomes the SSE accumulator (four dwords) from here on.
248 movq xmm1, QWORD PTR [rsi]
249 movq xmm2, QWORD PTR [rdi]
251 punpcklbw xmm1, xmm0
252 punpcklbw xmm2, xmm0
254 psubsw xmm1, xmm2
255 paddw xmm7, xmm1
257 pmaddwd xmm1, xmm1
; Row 1
259 movq xmm2, QWORD PTR[rsi + rax]
260 movq xmm3, QWORD PTR[rdi + rdx]
262 punpcklbw xmm2, xmm0
263 punpcklbw xmm3, xmm0
265 psubsw xmm2, xmm3
266 paddw xmm7, xmm2
268 pmaddwd xmm2, xmm2
269 paddd xmm1, xmm2
; Row 2
272 movq xmm2, QWORD PTR[rsi + rax * 2]
273 movq xmm3, QWORD PTR[rdi + rdx * 2]
275 punpcklbw xmm2, xmm0
276 punpcklbw xmm3, xmm0
278 psubsw xmm2, xmm3
279 paddw xmm7, xmm2
281 pmaddwd xmm2, xmm2
282 paddd xmm1, xmm2
; Step both pointers two rows down; row 3 is then at +stride.
285 lea rsi, [rsi + rax * 2]
286 lea rdi, [rdi + rdx * 2]
287 movq xmm2, QWORD PTR[rsi + rax]
288 movq xmm3, QWORD PTR[rdi + rdx]
290 punpcklbw xmm2, xmm0
291 punpcklbw xmm3, xmm0
293 psubsw xmm2, xmm3
294 paddw xmm7, xmm2
296 pmaddwd xmm2, xmm2
297 paddd xmm1, xmm2
; Row 4
299 movq xmm2, QWORD PTR[rsi + rax *2]
300 movq xmm3, QWORD PTR[rdi + rdx *2]
302 punpcklbw xmm2, xmm0
303 punpcklbw xmm3, xmm0
305 psubsw xmm2, xmm3
306 paddw xmm7, xmm2
308 pmaddwd xmm2, xmm2
309 paddd xmm1, xmm2
; Step two rows down; row 5 is at +stride.
312 lea rsi, [rsi + rax * 2]
313 lea rdi, [rdi + rdx * 2]
316 movq xmm2, QWORD PTR[rsi + rax]
317 movq xmm3, QWORD PTR[rdi + rdx]
319 punpcklbw xmm2, xmm0
320 punpcklbw xmm3, xmm0
322 psubsw xmm2, xmm3
323 paddw xmm7, xmm2
325 pmaddwd xmm2, xmm2
326 paddd xmm1, xmm2
; Row 6
328 movq xmm2, QWORD PTR[rsi + rax *2]
329 movq xmm3, QWORD PTR[rdi + rdx *2]
331 punpcklbw xmm2, xmm0
332 punpcklbw xmm3, xmm0
334 psubsw xmm2, xmm3
335 paddw xmm7, xmm2
337 pmaddwd xmm2, xmm2
338 paddd xmm1, xmm2
; Step two rows down; row 7 is at +stride.
341 lea rsi, [rsi + rax * 2]
342 lea rdi, [rdi + rdx * 2]
344 movq xmm2, QWORD PTR[rsi + rax]
345 movq xmm3, QWORD PTR[rdi + rdx]
347 punpcklbw xmm2, xmm0
348 punpcklbw xmm3, xmm0
350 psubsw xmm2, xmm3
351 paddw xmm7, xmm2
353 pmaddwd xmm2, xmm2
354 paddd xmm1, xmm2
; Reduction. The word sums are interleaved with zero and combined with 16-bit
; adds (paddw), so the Sum total stays a 16-bit quantity in the low word; the
; SSE dwords are folded with 32-bit adds (paddd) in parallel.
357 movdqa xmm6, xmm7
358 punpcklwd xmm6, xmm0
360 punpckhwd xmm7, xmm0
361 movdqa xmm2, xmm1
363 paddw xmm6, xmm7
364 punpckldq xmm1, xmm0
366 punpckhdq xmm2, xmm0
367 movdqa xmm7, xmm6
369 paddd xmm1, xmm2
370 punpckldq xmm6, xmm0
372 punpckhdq xmm7, xmm0
373 paddw xmm6, xmm7
375 movdqa xmm2, xmm1
376 movdqa xmm7, xmm6
378 psrldq xmm1, 8
379 psrldq xmm6, 8
381 paddw xmm7, xmm6 ; low word of xmm7 = Sum (16-bit)
382 paddd xmm1, xmm2 ; low dword of xmm1 = SSE
384 mov rax, arg(5) ;[Sum]
385 mov rdi, arg(4) ;[SSE]
; Sign-extend the 16-bit total before storing it as a 32-bit int.
387 movq rdx, xmm7
388 movsx rcx, dx
390 mov dword ptr [rax], ecx
391 movd DWORD PTR [rdi], xmm1
393 ; begin epilog
394 add rsp, 16
395 pop rdi
396 pop rsi
397 RESTORE_GOT
398 RESTORE_XMM
399 UNSHADOW_ARGS
400 pop rbp
401 ret
; Bilinear-filters an 8-pixel-wide strip of ref_ptr and accumulates its
; difference against src_ptr over Height rows.
;   In:   arg(0) = ref_ptr (filtered), arg(1) = ref_pixels_per_line,
;         arg(2) = src_ptr (used as-is), arg(3) = src_pixels_per_line,
;         arg(4) = Height (must be non-zero: the loops test after decrementing),
;         arg(5) = xoffset, arg(6) = yoffset; each selects a 32-byte row of
;         vp8_bilinear_filters_sse2 (offset << 5),
;         arg(7) = sum (output pointer), arg(8) = sumsquared (output pointer).
;   Out:  *sum        = sum of (filtered ref - src)
;         *sumsquared = sum of (filtered ref - src)^2
; Four paths, chosen by which offsets are zero:
;   both non-zero  : horizontal then vertical pass (reads Height+1 ref rows)
;   xoffset == 0   : vertical pass only            (reads Height+1 ref rows)
;   yoffset == 0   : horizontal pass only
;   both zero      : plain difference, no filtering
; Each pass computes (a * tap0 + b * tap1 + 64) >> xmm_filter_shift.
; The horizontal pass reads 9 bytes per ref row ([rsi] and [rsi+1]).
; NOTE(review): the final reduction writes MMX registers and no emms is
; issued here - confirm callers clear MMX state before using x87 code.
403 ;void vp8_filter_block2d_bil_var_sse2
404 ;(
405 ; unsigned char *ref_ptr,
406 ; int ref_pixels_per_line,
407 ; unsigned char *src_ptr,
408 ; int src_pixels_per_line,
409 ; unsigned int Height,
410 ; int xoffset,
411 ; int yoffset,
412 ; int *sum,
413 ; unsigned int *sumsquared
414 ;
415 ;)
416 global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE
417 sym(vp8_filter_block2d_bil_var_sse2):
418 push rbp
419 mov rbp, rsp
420 SHADOW_ARGS_TO_STACK 9
421 SAVE_XMM 7
422 GET_GOT rbx
423 push rsi
424 push rdi
; rbx is reused below as a stride register, so it is saved explicitly.
425 push rbx
426 ; end prolog
428 pxor xmm6, xmm6 ; xmm6 = sum of differences (eight words)
429 pxor xmm7, xmm7 ; xmm7 = sum of squared differences (four dwords)
431 lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
432 movdqa xmm4, XMMWORD PTR [rsi]
; rcx holds the filter table base until Height is loaded into it.
434 lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)]
435 movsxd rax, dword ptr arg(5) ; xoffset
437 cmp rax, 0 ; skip first_pass filter if xoffset=0
438 je filter_block2d_bil_var_sse2_sp_only
440 shl rax, 5 ; point to filter coeff with xoffset
441 lea rax, [rax + rcx] ; HFilter
443 movsxd rdx, dword ptr arg(6) ; yoffset
445 cmp rdx, 0 ; skip second_pass filter if yoffset=0
446 je filter_block2d_bil_var_sse2_fp_only
448 shl rdx, 5
449 lea rdx, [rdx + rcx] ; VFilter
; ---- Both passes -------------------------------------------------------
451 mov rsi, arg(0) ;ref_ptr
452 mov rdi, arg(2) ;src_ptr
453 movsxd rcx, dword ptr arg(4) ;Height
; Prime xmm5 with the horizontally filtered first row.
455 pxor xmm0, xmm0 ; zero, used to widen bytes to words
456 movq xmm1, QWORD PTR [rsi] ; pixels x .. x+7
457 movq xmm3, QWORD PTR [rsi+1] ; pixels x+1 .. x+8
459 punpcklbw xmm1, xmm0 ;
460 pmullw xmm1, [rax] ; * first horizontal tap
461 punpcklbw xmm3, xmm0
462 pmullw xmm3, [rax+16] ; * second horizontal tap
464 paddw xmm1, xmm3 ;
465 paddw xmm1, xmm4 ; + rounding
466 psraw xmm1, xmm_filter_shift ;
467 movdqa xmm5, xmm1
469 movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
470 lea rsi, [rsi + rbx]
471 %if ABI_IS_32BIT=0
472 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
473 %endif
475 filter_block2d_bil_var_sse2_loop:
; Horizontal pass on the next ref row.
476 movq xmm1, QWORD PTR [rsi] ;
477 movq xmm3, QWORD PTR [rsi+1] ;
479 punpcklbw xmm1, xmm0 ;
480 pmullw xmm1, [rax] ;
481 punpcklbw xmm3, xmm0 ;
482 pmullw xmm3, [rax+16] ;
484 paddw xmm1, xmm3 ;
485 paddw xmm1, xmm4 ;
486 psraw xmm1, xmm_filter_shift ;
; Vertical pass: xmm3 = previous filtered row, xmm1 = current filtered row;
; the current row is kept in xmm5 for the next iteration.
488 movdqa xmm3, xmm5 ;
489 movdqa xmm5, xmm1 ;
491 pmullw xmm3, [rdx] ; * first vertical tap
492 pmullw xmm1, [rdx+16] ; * second vertical tap
493 paddw xmm1, xmm3 ;
494 paddw xmm1, xmm4 ; + rounding
495 psraw xmm1, xmm_filter_shift ;
497 movq xmm3, QWORD PTR [rdi] ; src row
498 punpcklbw xmm3, xmm0 ;
500 psubw xmm1, xmm3 ; filtered ref - src
501 paddw xmm6, xmm1 ;
503 pmaddwd xmm1, xmm1 ;
504 paddd xmm7, xmm1 ;
506 lea rsi, [rsi + rbx] ;ref_pixels_per_line
507 %if ABI_IS_32BIT
508 add rdi, dword ptr arg(3) ;src_pixels_per_line
509 %else
510 lea rdi, [rdi + r9]
511 %endif
513 sub rcx, 1 ;
514 jnz filter_block2d_bil_var_sse2_loop ;
516 jmp filter_block2d_bil_variance
; ---- Vertical pass only (xoffset == 0) ---------------------------------
518 filter_block2d_bil_var_sse2_sp_only:
519 movsxd rdx, dword ptr arg(6) ; yoffset
521 cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
522 je filter_block2d_bil_var_sse2_full_pixel
524 shl rdx, 5
; rcx still holds the filter table base loaded before the first branch.
525 lea rdx, [rdx + rcx] ; VFilter
527 mov rsi, arg(0) ;ref_ptr
528 mov rdi, arg(2) ;src_ptr
529 movsxd rcx, dword ptr arg(4) ;Height
530 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
; Prime xmm1 with the first (unfiltered) ref row widened to words.
532 pxor xmm0, xmm0 ;
533 movq xmm1, QWORD PTR [rsi] ;
534 punpcklbw xmm1, xmm0 ;
536 movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
537 lea rsi, [rsi + rax]
539 filter_block2d_bil_sp_only_loop:
540 movq xmm3, QWORD PTR [rsi] ; next ref row
541 punpcklbw xmm3, xmm0 ;
542 movdqa xmm5, xmm3 ; keep it for the next iteration
544 pmullw xmm1, [rdx] ;
545 pmullw xmm3, [rdx+16] ;
546 paddw xmm1, xmm3 ;
547 paddw xmm1, xmm4 ;
548 psraw xmm1, xmm_filter_shift ;
550 movq xmm3, QWORD PTR [rdi] ;
551 punpcklbw xmm3, xmm0 ;
553 psubw xmm1, xmm3 ;
554 paddw xmm6, xmm1 ;
556 pmaddwd xmm1, xmm1 ;
557 paddd xmm7, xmm1 ;
559 movdqa xmm1, xmm5 ; current row becomes the previous row
560 lea rsi, [rsi + rax] ;ref_pixels_per_line
561 lea rdi, [rdi + rbx] ;src_pixels_per_line
563 sub rcx, 1 ;
564 jnz filter_block2d_bil_sp_only_loop ;
566 jmp filter_block2d_bil_variance
; ---- No filtering (xoffset == 0 and yoffset == 0) ----------------------
568 filter_block2d_bil_var_sse2_full_pixel:
569 mov rsi, arg(0) ;ref_ptr
570 mov rdi, arg(2) ;src_ptr
571 movsxd rcx, dword ptr arg(4) ;Height
572 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
573 movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
574 pxor xmm0, xmm0 ;
576 filter_block2d_bil_full_pixel_loop:
577 movq xmm1, QWORD PTR [rsi] ;
578 punpcklbw xmm1, xmm0 ;
580 movq xmm2, QWORD PTR [rdi] ;
581 punpcklbw xmm2, xmm0 ;
583 psubw xmm1, xmm2 ; ref - src
584 paddw xmm6, xmm1 ;
586 pmaddwd xmm1, xmm1 ;
587 paddd xmm7, xmm1 ;
589 lea rsi, [rsi + rax] ;ref_pixels_per_line
590 lea rdi, [rdi + rbx] ;src_pixels_per_line
592 sub rcx, 1 ;
593 jnz filter_block2d_bil_full_pixel_loop ;
595 jmp filter_block2d_bil_variance
; ---- Horizontal pass only (yoffset == 0) -------------------------------
; rax still points at the horizontal taps selected above.
597 filter_block2d_bil_var_sse2_fp_only:
598 mov rsi, arg(0) ;ref_ptr
599 mov rdi, arg(2) ;src_ptr
600 movsxd rcx, dword ptr arg(4) ;Height
601 movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
603 pxor xmm0, xmm0 ;
604 movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
606 filter_block2d_bil_fp_only_loop:
607 movq xmm1, QWORD PTR [rsi] ;
608 movq xmm3, QWORD PTR [rsi+1] ;
610 punpcklbw xmm1, xmm0 ;
611 pmullw xmm1, [rax] ;
612 punpcklbw xmm3, xmm0 ;
613 pmullw xmm3, [rax+16] ;
615 paddw xmm1, xmm3 ;
616 paddw xmm1, xmm4 ;
617 psraw xmm1, xmm_filter_shift ;
619 movq xmm3, QWORD PTR [rdi] ;
620 punpcklbw xmm3, xmm0 ;
622 psubw xmm1, xmm3 ;
623 paddw xmm6, xmm1 ;
625 pmaddwd xmm1, xmm1 ;
626 paddd xmm7, xmm1 ;
627 lea rsi, [rsi + rdx]
628 lea rdi, [rdi + rbx] ;src_pixels_per_line
630 sub rcx, 1 ;
631 jnz filter_block2d_bil_fp_only_loop ;
633 jmp filter_block2d_bil_variance
; ---- Common tail: reduce xmm6 (word sums) and xmm7 (dword squares) -----
635 filter_block2d_bil_variance:
; Fold the upper quadword of each accumulator onto the lower one via MMX.
636 movdq2q mm6, xmm6 ;
637 movdq2q mm7, xmm7 ;
639 psrldq xmm6, 8
640 psrldq xmm7, 8
642 movdq2q mm2, xmm6
643 movdq2q mm3, xmm7
645 paddw mm6, mm2
646 paddd mm7, mm3
; Place the four word sums in the upper halves of dwords, add them, and
; shift right arithmetically to recover a sign-extended 32-bit total.
648 pxor mm3, mm3 ;
649 pxor mm2, mm2 ;
651 punpcklwd mm2, mm6 ;
652 punpckhwd mm3, mm6 ;
654 paddd mm2, mm3 ;
655 movq mm6, mm2 ;
657 psrlq mm6, 32 ;
658 paddd mm2, mm6 ;
660 psrad mm2, 16 ; low dword of mm2 = sum
661 movq mm4, mm7 ;
663 psrlq mm4, 32 ;
664 paddd mm4, mm7 ; low dword of mm4 = sum of squares
666 mov rsi, arg(7) ; sum
667 mov rdi, arg(8) ; sumsquared
669 movd [rsi], mm2 ; xsum
670 movd [rdi], mm4 ; xxsum
672 ; begin epilog
673 pop rbx
674 pop rdi
675 pop rsi
676 RESTORE_GOT
677 RESTORE_XMM
678 UNSHADOW_ARGS
679 pop rbp
680 ret
; Half-pixel (horizontal + vertical) variance helper for an 8-pixel-wide strip.
; Each ref row is first averaged with itself shifted by one pixel (pavgb of
; [rsi] and [rsi+1]); consecutive averaged rows are then averaged vertically
; and compared against the src row.
;   In:   arg(0) = ref_ptr, arg(1) = ref_pixels_per_line,
;         arg(2) = src_ptr, arg(3) = src_pixels_per_line,
;         arg(4) = Height (must be non-zero: the loop tests after decrementing),
;         arg(5) = sum (output pointer), arg(6) = sumsquared (output pointer).
;         Reads Height+1 rows of 9 bytes from ref_ptr and Height rows of
;         8 bytes from src_ptr.
;   Out:  *sum        = sum of (averaged ref - src)
;         *sumsquared = sum of (averaged ref - src)^2
; NOTE(review): the final reduction writes MMX registers and no emms is
; issued here - confirm callers clear MMX state before using x87 code.
683 ;void vp8_half_horiz_vert_variance8x_h_sse2
684 ;(
685 ; unsigned char *ref_ptr,
686 ; int ref_pixels_per_line,
687 ; unsigned char *src_ptr,
688 ; int src_pixels_per_line,
689 ; unsigned int Height,
690 ; int *sum,
691 ; unsigned int *sumsquared
692 ;)
693 global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE
694 sym(vp8_half_horiz_vert_variance8x_h_sse2):
695 push rbp
696 mov rbp, rsp
697 SHADOW_ARGS_TO_STACK 7
698 SAVE_XMM 7
699 GET_GOT rbx
700 push rsi
701 push rdi
702 ; end prolog
; 64-bit builds keep both strides in r8/r9; 32-bit builds re-read them from
; the argument slots inside the loop.
704 %if ABI_IS_32BIT=0
705 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
706 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
707 %endif
709 pxor xmm6, xmm6 ; error accumulator
710 pxor xmm7, xmm7 ; sse accumulator
711 mov rsi, arg(0) ;ref_ptr ;
713 mov rdi, arg(2) ;src_ptr ;
714 movsxd rcx, dword ptr arg(4) ;Height ;
; (The former load of ref_pixels_per_line into rax was removed: rax is never
; read in this routine.)
717 pxor xmm0, xmm0 ; zero, used to widen bytes to words
; Prime xmm5 with the horizontally averaged first row.
719 movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s7
720 movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8
721 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3) horizontal line 1
723 %if ABI_IS_32BIT
724 add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source
725 %else
726 add rsi, r8
727 %endif
729 vp8_half_horiz_vert_variance8x_h_1:
731 movq xmm1, QWORD PTR [rsi] ;
732 movq xmm2, QWORD PTR [rsi+1] ;
733 pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
735 pavgb xmm5, xmm1 ; xmm5 = vertical average of lines i and i+1
736 punpcklbw xmm5, xmm0 ; xmm5 = words of above
738 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
739 punpcklbw xmm3, xmm0 ; xmm3 = words of above
741 psubw xmm5, xmm3 ; xmm5 -= xmm3
742 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
743 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
744 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
746 movdqa xmm5, xmm1 ; save xmm1 for use on the next row
748 %if ABI_IS_32BIT
749 add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
750 add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
751 %else
752 add rsi, r8
753 add rdi, r9
754 %endif
756 sub rcx, 1 ;
757 jnz vp8_half_horiz_vert_variance8x_h_1 ;
; Reduction: fold the upper quadword of each accumulator onto the lower one.
759 movdq2q mm6, xmm6 ;
760 movdq2q mm7, xmm7 ;
762 psrldq xmm6, 8
763 psrldq xmm7, 8
765 movdq2q mm2, xmm6
766 movdq2q mm3, xmm7
768 paddw mm6, mm2
769 paddd mm7, mm3
; Place the four word sums in the upper halves of dwords, add them, and
; shift right arithmetically to recover a sign-extended 32-bit total.
771 pxor mm3, mm3 ;
772 pxor mm2, mm2 ;
774 punpcklwd mm2, mm6 ;
775 punpckhwd mm3, mm6 ;
777 paddd mm2, mm3 ;
778 movq mm6, mm2 ;
780 psrlq mm6, 32 ;
781 paddd mm2, mm6 ;
783 psrad mm2, 16 ; low dword of mm2 = sum
784 movq mm4, mm7 ;
786 psrlq mm4, 32 ;
787 paddd mm4, mm7 ; low dword of mm4 = sum of squares
789 mov rsi, arg(5) ; sum
790 mov rdi, arg(6) ; sumsquared
792 movd [rsi], mm2 ;
793 movd [rdi], mm4 ;
796 ; begin epilog
797 pop rdi
798 pop rsi
799 RESTORE_GOT
800 RESTORE_XMM
801 UNSHADOW_ARGS
802 pop rbp
803 ret
; Half-pixel (horizontal + vertical) variance helper for a 16-pixel-wide strip.
; Each ref row is averaged with itself shifted by one pixel; consecutive
; averaged rows are then averaged vertically and compared against the src row.
;   In:   arg(0) = ref_ptr, arg(1) = ref_pixels_per_line,
;         arg(2) = src_ptr, arg(3) = src_pixels_per_line,
;         arg(4) = Height (must be non-zero: the loop tests after decrementing),
;         arg(5) = sum (output pointer), arg(6) = sumsquared (output pointer).
;         Reads Height+1 rows of 17 bytes from ref_ptr (unaligned loads) and
;         Height rows of 16 bytes from src_ptr.
;   Out:  *sum        = sum of (averaged ref - src)
;         *sumsquared = sum of (averaged ref - src)^2
805 ;void vp8_half_horiz_vert_variance16x_h_sse2
806 ;(
807 ; unsigned char *ref_ptr,
808 ; int ref_pixels_per_line,
809 ; unsigned char *src_ptr,
810 ; int src_pixels_per_line,
811 ; unsigned int Height,
812 ; int *sum,
813 ; unsigned int *sumsquared
814 ;)
815 global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE
816 sym(vp8_half_horiz_vert_variance16x_h_sse2):
817 push rbp
818 mov rbp, rsp
819 SHADOW_ARGS_TO_STACK 7
820 SAVE_XMM 7
821 GET_GOT rbx
822 push rsi
823 push rdi
824 ; end prolog
826 pxor xmm6, xmm6 ; error accumulator
827 pxor xmm7, xmm7 ; sse accumulator
828 mov rsi, arg(0) ;ref_ptr ;
830 mov rdi, arg(2) ;src_ptr ;
831 movsxd rcx, dword ptr arg(4) ;Height ;
832 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
833 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
835 pxor xmm0, xmm0 ; zero, used to widen bytes to words
; Prime xmm5 with the horizontally averaged first row.
837 movdqu xmm5, XMMWORD PTR [rsi]
838 movdqu xmm3, XMMWORD PTR [rsi+1]
839 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3) horizontal line 1
841 lea rsi, [rsi + rax]
843 vp8_half_horiz_vert_variance16x_h_1:
844 movdqu xmm1, XMMWORD PTR [rsi] ;
845 movdqu xmm2, XMMWORD PTR [rsi+1] ;
846 pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
848 pavgb xmm5, xmm1 ; xmm5 = vertical average of lines i and i+1
; Split the 16 averaged bytes into two groups of eight words.
850 movdqa xmm4, xmm5
851 punpcklbw xmm5, xmm0 ; xmm5 = words of pixels 0..7
852 punpckhbw xmm4, xmm0 ; xmm4 = words of pixels 8..15
854 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
855 punpcklbw xmm3, xmm0 ; xmm3 = words of above
856 psubw xmm5, xmm3 ; xmm5 -= xmm3
858 movq xmm3, QWORD PTR [rdi+8] ; xmm3 = d8,d9..d15
859 punpcklbw xmm3, xmm0
860 psubw xmm4, xmm3
862 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
863 paddw xmm6, xmm4
864 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
865 pmaddwd xmm4, xmm4
866 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
867 paddd xmm7, xmm4
869 movdqa xmm5, xmm1 ; save xmm1 for use on the next row
871 lea rsi, [rsi + rax]
872 lea rdi, [rdi + rdx]
874 sub rcx, 1 ;
875 jnz vp8_half_horiz_vert_variance16x_h_1 ;
; Reduction. xmm0 is still zero here (it is never written inside the loop),
; so unpacking xmm6 into it places each word sum in the upper half of a
; dword; psrad then sign-extends it.
877 pxor xmm1, xmm1
878 pxor xmm5, xmm5
880 punpcklwd xmm0, xmm6
881 punpckhwd xmm1, xmm6
882 psrad xmm0, 16
883 psrad xmm1, 16
884 paddd xmm0, xmm1 ; xmm0 = four dword partial sums
885 movdqa xmm1, xmm0
; Interleave with zero (xmm5) and add: 4 dwords -> 2 for both accumulators.
887 movdqa xmm6, xmm7
888 punpckldq xmm6, xmm5
889 punpckhdq xmm7, xmm5
890 paddd xmm6, xmm7
892 punpckldq xmm0, xmm5
893 punpckhdq xmm1, xmm5
894 paddd xmm0, xmm1
; Fold the upper quadword onto the lower one.
896 movdqa xmm7, xmm6
897 movdqa xmm1, xmm0
899 psrldq xmm7, 8
900 psrldq xmm1, 8
902 paddd xmm6, xmm7 ; low dword of xmm6 = sum of squares
903 paddd xmm0, xmm1 ; low dword of xmm0 = sum
905 mov rsi, arg(5) ;[Sum]
906 mov rdi, arg(6) ;[SSE]
908 movd [rsi], xmm0
909 movd [rdi], xmm6
911 ; begin epilog
912 pop rdi
913 pop rsi
914 RESTORE_GOT
915 RESTORE_XMM
916 UNSHADOW_ARGS
917 pop rbp
918 ret
; Half-pixel vertical variance helper for an 8-pixel-wide strip. Each ref row
; is averaged (pavgb) with the row one stride below it and compared against
; the src row.
;   In:   arg(0) = ref_ptr, arg(1) = ref_pixels_per_line,
;         arg(2) = src_ptr, arg(3) = src_pixels_per_line,
;         arg(4) = Height (must be non-zero: the loop tests after decrementing),
;         arg(5) = sum (output pointer), arg(6) = sumsquared (output pointer).
;         Both ref rows are reloaded on every iteration, so ref rows
;         0..Height are touched.
;   Out:  *sum        = sum of (averaged ref - src)
;         *sumsquared = sum of (averaged ref - src)^2
; NOTE(review): the final reduction writes MMX registers and no emms is
; issued here - confirm callers clear MMX state before using x87 code.
921 ;void vp8_half_vert_variance8x_h_sse2
922 ;(
923 ; unsigned char *ref_ptr,
924 ; int ref_pixels_per_line,
925 ; unsigned char *src_ptr,
926 ; int src_pixels_per_line,
927 ; unsigned int Height,
928 ; int *sum,
929 ; unsigned int *sumsquared
930 ;)
931 global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE
932 sym(vp8_half_vert_variance8x_h_sse2):
933 push rbp
934 mov rbp, rsp
935 SHADOW_ARGS_TO_STACK 7
936 SAVE_XMM 7
937 GET_GOT rbx
938 push rsi
939 push rdi
940 ; end prolog
; 64-bit builds keep both strides in r8/r9; 32-bit builds re-read them from
; the argument slots inside the loop.
942 %if ABI_IS_32BIT=0
943 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
944 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
945 %endif
947 pxor xmm6, xmm6 ; error accumulator
948 pxor xmm7, xmm7 ; sse accumulator
949 mov rsi, arg(0) ;ref_ptr ;
951 mov rdi, arg(2) ;src_ptr ;
952 movsxd rcx, dword ptr arg(4) ;Height ;
; rax = ref stride, used to address the row below the current one.
953 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
955 pxor xmm0, xmm0 ; zero, used to widen bytes to words
956 vp8_half_vert_variance8x_h_1:
957 movq xmm5, QWORD PTR [rsi] ; xmm5 = 8 pixels of the current ref row
958 movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = 8 pixels of the ref row below
960 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
961 punpcklbw xmm5, xmm0 ; xmm5 = words of above
963 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
964 punpcklbw xmm3, xmm0 ; xmm3 = words of above
966 psubw xmm5, xmm3 ; xmm5 -= xmm3
967 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
968 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
969 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
971 %if ABI_IS_32BIT
972 add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
973 add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
974 %else
975 add rsi, r8
976 add rdi, r9
977 %endif
979 sub rcx, 1 ;
980 jnz vp8_half_vert_variance8x_h_1 ;
; Reduction: fold the upper quadword of each accumulator onto the lower one.
982 movdq2q mm6, xmm6 ;
983 movdq2q mm7, xmm7 ;
985 psrldq xmm6, 8
986 psrldq xmm7, 8
988 movdq2q mm2, xmm6
989 movdq2q mm3, xmm7
991 paddw mm6, mm2
992 paddd mm7, mm3
; Place the four word sums in the upper halves of dwords, add them, and
; shift right arithmetically to recover a sign-extended 32-bit total.
994 pxor mm3, mm3 ;
995 pxor mm2, mm2 ;
997 punpcklwd mm2, mm6 ;
998 punpckhwd mm3, mm6 ;
1000 paddd mm2, mm3 ;
1001 movq mm6, mm2 ;
1003 psrlq mm6, 32 ;
1004 paddd mm2, mm6 ;
1006 psrad mm2, 16 ; low dword of mm2 = sum
1007 movq mm4, mm7 ;
1009 psrlq mm4, 32 ;
1010 paddd mm4, mm7 ; low dword of mm4 = sum of squares
1012 mov rsi, arg(5) ; sum
1013 mov rdi, arg(6) ; sumsquared
1015 movd [rsi], mm2 ;
1016 movd [rdi], mm4 ;
1019 ; begin epilog
1020 pop rdi
1021 pop rsi
1022 RESTORE_GOT
1023 RESTORE_XMM
1024 UNSHADOW_ARGS
1025 pop rbp
1026 ret
; Half-pixel vertical variance helper for a 16-pixel-wide strip. Each ref row
; is averaged (pavgb) with the row below it and compared against the src row.
; The lower row of one iteration is carried in xmm5 as the upper row of the
; next, so every ref row is loaded only once.
;   In:   arg(0) = ref_ptr, arg(1) = ref_pixels_per_line,
;         arg(2) = src_ptr, arg(3) = src_pixels_per_line,
;         arg(4) = Height (must be non-zero: the loop tests after decrementing),
;         arg(5) = sum (output pointer), arg(6) = sumsquared (output pointer).
;         Reads Height+1 rows of 16 bytes from ref_ptr (unaligned loads).
;   Out:  *sum        = sum of (averaged ref - src)
;         *sumsquared = sum of (averaged ref - src)^2
1028 ;void vp8_half_vert_variance16x_h_sse2
1029 ;(
1030 ; unsigned char *ref_ptr,
1031 ; int ref_pixels_per_line,
1032 ; unsigned char *src_ptr,
1033 ; int src_pixels_per_line,
1034 ; unsigned int Height,
1035 ; int *sum,
1036 ; unsigned int *sumsquared
1037 ;)
1038 global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE
1039 sym(vp8_half_vert_variance16x_h_sse2):
1040 push rbp
1041 mov rbp, rsp
1042 SHADOW_ARGS_TO_STACK 7
1043 SAVE_XMM 7
1044 GET_GOT rbx
1045 push rsi
1046 push rdi
1047 ; end prolog
1049 pxor xmm6, xmm6 ; error accumulator
1050 pxor xmm7, xmm7 ; sse accumulator
1051 mov rsi, arg(0) ;ref_ptr
1053 mov rdi, arg(2) ;src_ptr
1054 movsxd rcx, dword ptr arg(4) ;Height
1055 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
1056 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
; Prime xmm5 with the first ref row, then step to the second.
1058 movdqu xmm5, XMMWORD PTR [rsi]
1059 lea rsi, [rsi + rax ]
1060 pxor xmm0, xmm0 ; zero, used to widen bytes to words
1062 vp8_half_vert_variance16x_h_1:
1063 movdqu xmm3, XMMWORD PTR [rsi] ; xmm3 = ref row below the one in xmm5
1065 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
; Split the 16 averaged bytes into two groups of eight words.
1066 movdqa xmm4, xmm5
1067 punpcklbw xmm5, xmm0
1068 punpckhbw xmm4, xmm0
1070 movq xmm2, QWORD PTR [rdi] ; src pixels 0..7
1071 punpcklbw xmm2, xmm0
1072 psubw xmm5, xmm2
1073 movq xmm2, QWORD PTR [rdi+8] ; src pixels 8..15
1074 punpcklbw xmm2, xmm0
1075 psubw xmm4, xmm2
1077 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
1078 paddw xmm6, xmm4
1079 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
1080 pmaddwd xmm4, xmm4
1081 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
1082 paddd xmm7, xmm4
1084 movdqa xmm5, xmm3 ; lower row becomes the upper row next time
1086 lea rsi, [rsi + rax]
1087 lea rdi, [rdi + rdx]
1089 sub rcx, 1
1090 jnz vp8_half_vert_variance16x_h_1
; Reduction. xmm0 is still zero here (it is never written inside the loop),
; so unpacking xmm6 into it places each word sum in the upper half of a
; dword; psrad then sign-extends it.
1092 pxor xmm1, xmm1
1093 pxor xmm5, xmm5
1095 punpcklwd xmm0, xmm6
1096 punpckhwd xmm1, xmm6
1097 psrad xmm0, 16
1098 psrad xmm1, 16
1099 paddd xmm0, xmm1 ; xmm0 = four dword partial sums
1100 movdqa xmm1, xmm0
; Interleave with zero (xmm5) and add: 4 dwords -> 2 for both accumulators.
1102 movdqa xmm6, xmm7
1103 punpckldq xmm6, xmm5
1104 punpckhdq xmm7, xmm5
1105 paddd xmm6, xmm7
1107 punpckldq xmm0, xmm5
1108 punpckhdq xmm1, xmm5
1109 paddd xmm0, xmm1
; Fold the upper quadword onto the lower one.
1111 movdqa xmm7, xmm6
1112 movdqa xmm1, xmm0
1114 psrldq xmm7, 8
1115 psrldq xmm1, 8
1117 paddd xmm6, xmm7 ; low dword of xmm6 = sum of squares
1118 paddd xmm0, xmm1 ; low dword of xmm0 = sum
1120 mov rsi, arg(5) ;[Sum]
1121 mov rdi, arg(6) ;[SSE]
1123 movd [rsi], xmm0
1124 movd [rdi], xmm6
1126 ; begin epilog
1127 pop rdi
1128 pop rsi
1129 RESTORE_GOT
1130 RESTORE_XMM
1131 UNSHADOW_ARGS
1132 pop rbp
1133 ret
; Half-pixel horizontal variance helper for an 8-pixel-wide strip. Each ref
; row is averaged (pavgb) with itself shifted by one pixel and compared
; against the src row.
;   In:   arg(0) = ref_ptr, arg(1) = ref_pixels_per_line,
;         arg(2) = src_ptr, arg(3) = src_pixels_per_line,
;         arg(4) = Height (must be non-zero: the loop tests after decrementing),
;         arg(5) = sum (output pointer), arg(6) = sumsquared (output pointer).
;         Reads Height rows of 9 bytes from ref_ptr ([rsi] and [rsi+1]).
;   Out:  *sum        = sum of (averaged ref - src)
;         *sumsquared = sum of (averaged ref - src)^2
; NOTE(review): the final reduction writes MMX registers and no emms is
; issued here - confirm callers clear MMX state before using x87 code.
1136 ;void vp8_half_horiz_variance8x_h_sse2
1137 ;(
1138 ; unsigned char *ref_ptr,
1139 ; int ref_pixels_per_line,
1140 ; unsigned char *src_ptr,
1141 ; int src_pixels_per_line,
1142 ; unsigned int Height,
1143 ; int *sum,
1144 ; unsigned int *sumsquared
1145 ;)
1146 global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE
1147 sym(vp8_half_horiz_variance8x_h_sse2):
1148 push rbp
1149 mov rbp, rsp
1150 SHADOW_ARGS_TO_STACK 7
1151 SAVE_XMM 7
1152 GET_GOT rbx
1153 push rsi
1154 push rdi
1155 ; end prolog
; 64-bit builds keep both strides in r8/r9; 32-bit builds re-read them from
; the argument slots inside the loop.
1157 %if ABI_IS_32BIT=0
1158 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
1159 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
1160 %endif
1162 pxor xmm6, xmm6 ; error accumulator
1163 pxor xmm7, xmm7 ; sse accumulator
1164 mov rsi, arg(0) ;ref_ptr ;
1166 mov rdi, arg(2) ;src_ptr ;
1167 movsxd rcx, dword ptr arg(4) ;Height ;
1169 pxor xmm0, xmm0 ; zero, used to widen bytes to words
1170 vp8_half_horiz_variance8x_h_1:
1171 movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s7
1172 movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8
1174 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
1175 punpcklbw xmm5, xmm0 ; xmm5 = words of above
1177 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
1178 punpcklbw xmm3, xmm0 ; xmm3 = words of above
1180 psubw xmm5, xmm3 ; xmm5 -= xmm3
1181 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
1182 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
1183 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
1185 %if ABI_IS_32BIT
1186 add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
1187 add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
1188 %else
1189 add rsi, r8
1190 add rdi, r9
1191 %endif
1192 sub rcx, 1 ;
1193 jnz vp8_half_horiz_variance8x_h_1 ;
; Reduction: fold the upper quadword of each accumulator onto the lower one.
1195 movdq2q mm6, xmm6 ;
1196 movdq2q mm7, xmm7 ;
1198 psrldq xmm6, 8
1199 psrldq xmm7, 8
1201 movdq2q mm2, xmm6
1202 movdq2q mm3, xmm7
1204 paddw mm6, mm2
1205 paddd mm7, mm3
; Place the four word sums in the upper halves of dwords, add them, and
; shift right arithmetically to recover a sign-extended 32-bit total.
1207 pxor mm3, mm3 ;
1208 pxor mm2, mm2 ;
1210 punpcklwd mm2, mm6 ;
1211 punpckhwd mm3, mm6 ;
1213 paddd mm2, mm3 ;
1214 movq mm6, mm2 ;
1216 psrlq mm6, 32 ;
1217 paddd mm2, mm6 ;
1219 psrad mm2, 16 ; low dword of mm2 = sum
1220 movq mm4, mm7 ;
1222 psrlq mm4, 32 ;
1223 paddd mm4, mm7 ; low dword of mm4 = sum of squares
1225 mov rsi, arg(5) ; sum
1226 mov rdi, arg(6) ; sumsquared
1228 movd [rsi], mm2 ;
1229 movd [rdi], mm4 ;
1232 ; begin epilog
1233 pop rdi
1234 pop rsi
1235 RESTORE_GOT
1236 RESTORE_XMM
1237 UNSHADOW_ARGS
1238 pop rbp
1239 ret
; Half-pixel horizontal variance helper for a 16-pixel-wide strip. Each ref
; row is averaged (pavgb) with itself shifted by one pixel and compared
; against the src row.
;   In:   arg(0) = ref_ptr, arg(1) = ref_pixels_per_line,
;         arg(2) = src_ptr, arg(3) = src_pixels_per_line,
;         arg(4) = Height (must be non-zero: the loop tests after decrementing),
;         arg(5) = sum (output pointer), arg(6) = sumsquared (output pointer).
;         Reads Height rows of 17 bytes from ref_ptr (unaligned loads).
;   Out:  *sum        = sum of (averaged ref - src)
;         *sumsquared = sum of (averaged ref - src)^2
1241 ;void vp8_half_horiz_variance16x_h_sse2
1242 ;(
1243 ; unsigned char *ref_ptr,
1244 ; int ref_pixels_per_line,
1245 ; unsigned char *src_ptr,
1246 ; int src_pixels_per_line,
1247 ; unsigned int Height,
1248 ; int *sum,
1249 ; unsigned int *sumsquared
1250 ;)
1251 global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE
1252 sym(vp8_half_horiz_variance16x_h_sse2):
1253 push rbp
1254 mov rbp, rsp
1255 SHADOW_ARGS_TO_STACK 7
1256 SAVE_XMM 7
1257 GET_GOT rbx
1258 push rsi
1259 push rdi
1260 ; end prolog
1262 pxor xmm6, xmm6 ; error accumulator
1263 pxor xmm7, xmm7 ; sse accumulator
1264 mov rsi, arg(0) ;ref_ptr ;
1266 mov rdi, arg(2) ;src_ptr ;
1267 movsxd rcx, dword ptr arg(4) ;Height ;
1268 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
1269 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
1271 pxor xmm0, xmm0 ; zero, used to widen bytes to words
1273 vp8_half_horiz_variance16x_h_1:
1274 movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
1275 movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
1277 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
; Split the 16 averaged bytes into two groups of eight words.
1278 movdqa xmm1, xmm5
1279 punpcklbw xmm5, xmm0 ; xmm5 = words of pixels 0..7
1280 punpckhbw xmm1, xmm0 ; xmm1 = words of pixels 8..15
1282 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
1283 punpcklbw xmm3, xmm0 ; xmm3 = words of above
1284 movq xmm2, QWORD PTR [rdi+8] ; xmm2 = d8,d9..d15
1285 punpcklbw xmm2, xmm0
1287 psubw xmm5, xmm3 ; xmm5 -= xmm3
1288 psubw xmm1, xmm2
1289 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
1290 paddw xmm6, xmm1
1291 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
1292 pmaddwd xmm1, xmm1
1293 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
1294 paddd xmm7, xmm1
1296 lea rsi, [rsi + rax]
1297 lea rdi, [rdi + rdx]
1299 sub rcx, 1 ;
1300 jnz vp8_half_horiz_variance16x_h_1 ;
; Reduction. xmm0 is still zero here (it is never written inside the loop),
; so unpacking xmm6 into it places each word sum in the upper half of a
; dword; psrad then sign-extends it.
1302 pxor xmm1, xmm1
1303 pxor xmm5, xmm5
1305 punpcklwd xmm0, xmm6
1306 punpckhwd xmm1, xmm6
1307 psrad xmm0, 16
1308 psrad xmm1, 16
1309 paddd xmm0, xmm1 ; xmm0 = four dword partial sums
1310 movdqa xmm1, xmm0
; Interleave with zero (xmm5) and add: 4 dwords -> 2 for both accumulators.
1312 movdqa xmm6, xmm7
1313 punpckldq xmm6, xmm5
1314 punpckhdq xmm7, xmm5
1315 paddd xmm6, xmm7
1317 punpckldq xmm0, xmm5
1318 punpckhdq xmm1, xmm5
1319 paddd xmm0, xmm1
; Fold the upper quadword onto the lower one.
1321 movdqa xmm7, xmm6
1322 movdqa xmm1, xmm0
1324 psrldq xmm7, 8
1325 psrldq xmm1, 8
1327 paddd xmm6, xmm7 ; low dword of xmm6 = sum of squares
1328 paddd xmm0, xmm1 ; low dword of xmm0 = sum
1330 mov rsi, arg(5) ;[Sum]
1331 mov rdi, arg(6) ;[SSE]
1333 movd [rsi], xmm0
1334 movd [rdi], xmm6
1336 ; begin epilog
1337 pop rdi
1338 pop rsi
1339 RESTORE_GOT
1340 RESTORE_XMM
1341 UNSHADOW_ARGS
1342 pop rbp
1343 ret
; Read-only constants used by vp8_filter_block2d_bil_var_sse2.
1345 SECTION_RODATA
1346 ; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
; Rounding term added before the >> xmm_filter_shift (64 = half of 128).
1347 align 16
1348 xmm_bi_rd:
1349 times 8 dw 64
; Bilinear tap table: 8 rows of 32 bytes, indexed by (offset << 5).
; Row k holds eight copies of the first tap (128 - 16*k) followed by eight
; copies of the second tap (16*k), so each tap pair sums to 128. The code
; reads the first tap vector at [row] and the second at [row+16].
1350 align 16
1351 vp8_bilinear_filters_sse2:
1352 dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
1353 dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
1354 dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
1355 dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
1356 dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
1357 dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
1358 dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
1359 dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112