media/libvpx/vp8/common/x86/variance_impl_sse2.asm

branch
TOR_BUG_9701
changeset 10
ac0c01689b40
equal deleted inserted replaced
-1:000000000000 0:a12acc43b26f
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
; Right-shift count used by the bilinear filter passes after the xmm_bi_rd
; rounding term has been added (psraw reg, xmm_filter_shift).
14 %define xmm_filter_shift 7
15
16 ;unsigned int vp8_get_mb_ss_sse2
17 ;(
18 ; short *src_ptr
19 ;)
; Sums the squares of 256 consecutive 16-bit values starting at src_ptr
; (8 iterations x 64 bytes) and returns the 32-bit total in the low dword
; of rax.
; src_ptr must be 16-byte aligned: every load is movdqa.
; Registers: rax = read pointer, rcx = iterations left, xmm4 = four dword
; partial sums, xmm0-xmm3 = scratch.
20 global sym(vp8_get_mb_ss_sse2) PRIVATE
21 sym(vp8_get_mb_ss_sse2):
22 push rbp
23 mov rbp, rsp
24 SHADOW_ARGS_TO_STACK 1
25 GET_GOT rbx
26 push rsi
27 push rdi
28 sub rsp, 16
29 ; end prolog
30
31
32 mov rax, arg(0) ;[src_ptr]
33 mov rcx, 8
34 pxor xmm4, xmm4
35
36 .NEXTROW:
; Load 32 words, then square and pair-sum them (pmaddwd x, x).
37 movdqa xmm0, [rax]
38 movdqa xmm1, [rax+16]
39 movdqa xmm2, [rax+32]
40 movdqa xmm3, [rax+48]
41 pmaddwd xmm0, xmm0
42 pmaddwd xmm1, xmm1
43 pmaddwd xmm2, xmm2
44 pmaddwd xmm3, xmm3
45
46 paddd xmm0, xmm1
47 paddd xmm2, xmm3
48 paddd xmm4, xmm0
49 paddd xmm4, xmm2
50
51 add rax, 0x40
52 dec rcx
; dec updates ZF but leaves CF untouched, so branch on ZF alone rather than
; on a condition that also reads the carry left behind by the add above.
53 jnz .NEXTROW
54
; Horizontal add of the four dword lanes; the total ends up in lane 0.
55 movdqa xmm3,xmm4
56 psrldq xmm4,8
57 paddd xmm4,xmm3
58 movdqa xmm3,xmm4
59 psrldq xmm4,4
60 paddd xmm4,xmm3
; Only the low dword is the result; the upper half of the moved qword is a
; partial sum and is not part of the unsigned int return value.
61 movq rax,xmm4
62
63
64 ; begin epilog
65 add rsp, 16
66 pop rdi
67 pop rsi
68 RESTORE_GOT
69 UNSHADOW_ARGS
70 pop rbp
71 ret
72
73
74 ;unsigned int vp8_get16x16var_sse2
75 ;(
76 ; unsigned char * src_ptr,
77 ; int source_stride,
78 ; unsigned char * ref_ptr,
79 ; int recon_stride,
80 ; unsigned int * SSE,
81 ; int * Sum
82 ;)
; For a 16-row by 16-byte block, accumulates the sum of (src - ref) and the
; sum of (src - ref)^2, then stores them to *Sum and *SSE respectively.
; Loads are unaligned (movdqu), so neither pointer needs 16-byte alignment.
; Registers: rsi/rdi = src/ref row pointers, rax/rdx = sign-extended strides,
; rcx = rows left, xmm0 = zero, xmm7 = word difference sums,
; xmm6 = dword squared-difference sums.
; NOTE(review): the last write to rax visible here is the Sum pointer, so the
; declared unsigned int return value does not appear to be meaningful - confirm
; against callers.
83 global sym(vp8_get16x16var_sse2) PRIVATE
84 sym(vp8_get16x16var_sse2):
85 push rbp
86 mov rbp, rsp
87 SHADOW_ARGS_TO_STACK 6
88 SAVE_XMM 7
89 push rbx
90 push rsi
91 push rdi
92 ; end prolog
93
94 mov rsi, arg(0) ;[src_ptr]
95 mov rdi, arg(2) ;[ref_ptr]
96
97 movsxd rax, DWORD PTR arg(1) ;[source_stride]
98 movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
99
100 ; Prefetch data: first 8 rows of src (rcx = 3*stride, rbx = row 4)
101 lea rcx, [rax+rax*2]
102 prefetcht0 [rsi]
103 prefetcht0 [rsi+rax]
104 prefetcht0 [rsi+rax*2]
105 prefetcht0 [rsi+rcx]
106 lea rbx, [rsi+rax*4]
107 prefetcht0 [rbx]
108 prefetcht0 [rbx+rax]
109 prefetcht0 [rbx+rax*2]
110 prefetcht0 [rbx+rcx]
111
; Same prefetch pattern for the first 8 rows of ref.
112 lea rcx, [rdx+rdx*2]
113 prefetcht0 [rdi]
114 prefetcht0 [rdi+rdx]
115 prefetcht0 [rdi+rdx*2]
116 prefetcht0 [rdi+rcx]
117 lea rbx, [rdi+rdx*4]
118 prefetcht0 [rbx]
119 prefetcht0 [rbx+rdx]
120 prefetcht0 [rbx+rdx*2]
121 prefetcht0 [rbx+rcx]
122
123 pxor xmm0, xmm0 ; clear xmm0 for unpack
124 pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
125
126 pxor xmm6, xmm6 ; clear xmm6 for accumulating sse
127 mov rcx, 16
128
129 .var16loop:
130 movdqu xmm1, XMMWORD PTR [rsi]
131 movdqu xmm2, XMMWORD PTR [rdi]
132
; Prefetch the rows eight strides ahead of the current ones.
133 prefetcht0 [rsi+rax*8]
134 prefetcht0 [rdi+rdx*8]
135
136 movdqa xmm3, xmm1
137 movdqa xmm4, xmm2
138
139
; Zero-extend bytes to words: xmm1/xmm2 = low 8 pixels, xmm3/xmm4 = high 8.
140 punpcklbw xmm1, xmm0
141 punpckhbw xmm3, xmm0
142
143 punpcklbw xmm2, xmm0
144 punpckhbw xmm4, xmm0
145
146
147 psubw xmm1, xmm2
148 psubw xmm3, xmm4
149
; Accumulate the signed differences, then square-and-pair-sum them.
150 paddw xmm7, xmm1
151 pmaddwd xmm1, xmm1
152
153 paddw xmm7, xmm3
154 pmaddwd xmm3, xmm3
155
156 paddd xmm6, xmm1
157 paddd xmm6, xmm3
158
159 add rsi, rax
160 add rdi, rdx
161
162 sub rcx, 1
163 jnz .var16loop
164
165
; xmm1 takes over the SSE lanes; xmm6/xmm5 are reused to widen the word sums.
166 movdqa xmm1, xmm6
167 pxor xmm6, xmm6
168
169 pxor xmm5, xmm5
; Place each word sum in the high half of a dword, then arithmetic-shift
; right by 16 to sign-extend it to 32 bits.
170 punpcklwd xmm6, xmm7
171
172 punpckhwd xmm5, xmm7
173 psrad xmm5, 16
174
175 psrad xmm6, 16
176 paddd xmm6, xmm5
177
; Horizontal reduction of both accumulators: interleave with zero (xmm0),
; add the halves, then fold the upper qword onto the lower one.
178 movdqa xmm2, xmm1
179 punpckldq xmm1, xmm0
180
181 punpckhdq xmm2, xmm0
182 movdqa xmm7, xmm6
183
184 paddd xmm1, xmm2
185 punpckldq xmm6, xmm0
186
187 punpckhdq xmm7, xmm0
188 paddd xmm6, xmm7
189
190 movdqa xmm2, xmm1
191 movdqa xmm7, xmm6
192
193 psrldq xmm1, 8
194 psrldq xmm6, 8
195
196 paddd xmm7, xmm6
197 paddd xmm1, xmm2
198
199 mov rax, arg(5) ;[Sum]
200 mov rdi, arg(4) ;[SSE]
201
; Low dword of xmm7 = total difference, low dword of xmm1 = total SSE.
202 movd DWORD PTR [rax], xmm7
203 movd DWORD PTR [rdi], xmm1
204
205
206 ; begin epilog
207 pop rdi
208 pop rsi
209 pop rbx
210 RESTORE_XMM
211 UNSHADOW_ARGS
212 pop rbp
213 ret
214
215
216
217
218 ;unsigned int vp8_get8x8var_sse2
219 ;(
220 ; unsigned char * src_ptr,
221 ; int source_stride,
222 ; unsigned char * ref_ptr,
223 ; int recon_stride,
224 ; unsigned int * SSE,
225 ; int * Sum
226 ;)
; For an 8-row by 8-byte block (fully unrolled), accumulates the sum of
; (src - ref) and the sum of (src - ref)^2 and stores them to *Sum and *SSE.
; Registers: rsi/rdi = src/ref pointers (advanced two rows at a time),
; rax/rdx = sign-extended strides, xmm0 = zero, xmm7 = word difference sums,
; xmm1 = dword squared-difference sums, xmm2/xmm3 = per-row scratch.
; The difference sum is kept in 16-bit lanes throughout and only
; sign-extended at the very end (movsx rcx, dx).
227 global sym(vp8_get8x8var_sse2) PRIVATE
228 sym(vp8_get8x8var_sse2):
229 push rbp
230 mov rbp, rsp
231 SHADOW_ARGS_TO_STACK 6
232 SAVE_XMM 7
233 GET_GOT rbx
234 push rsi
235 push rdi
236 sub rsp, 16
237 ; end prolog
238
239 mov rsi, arg(0) ;[src_ptr]
240 mov rdi, arg(2) ;[ref_ptr]
241
242 movsxd rax, DWORD PTR arg(1) ;[source_stride]
243 movsxd rdx, DWORD PTR arg(3) ;[recon_stride]
244
245 pxor xmm0, xmm0 ; clear xmm0 for unpack
246 pxor xmm7, xmm7 ; clear xmm7 for accumulating diffs
247
; Row 0: its squared differences seed the SSE accumulator xmm1.
248 movq xmm1, QWORD PTR [rsi]
249 movq xmm2, QWORD PTR [rdi]
250
251 punpcklbw xmm1, xmm0
252 punpcklbw xmm2, xmm0
253
254 psubsw xmm1, xmm2
255 paddw xmm7, xmm1
256
257 pmaddwd xmm1, xmm1
258
; Row 1
259 movq xmm2, QWORD PTR[rsi + rax]
260 movq xmm3, QWORD PTR[rdi + rdx]
261
262 punpcklbw xmm2, xmm0
263 punpcklbw xmm3, xmm0
264
265 psubsw xmm2, xmm3
266 paddw xmm7, xmm2
267
268 pmaddwd xmm2, xmm2
269 paddd xmm1, xmm2
270
271
; Row 2
272 movq xmm2, QWORD PTR[rsi + rax * 2]
273 movq xmm3, QWORD PTR[rdi + rdx * 2]
274
275 punpcklbw xmm2, xmm0
276 punpcklbw xmm3, xmm0
277
278 psubsw xmm2, xmm3
279 paddw xmm7, xmm2
280
281 pmaddwd xmm2, xmm2
282 paddd xmm1, xmm2
283
284
; Advance both pointers to row 2; rows 3 and 4 are then at +stride, +2*stride.
285 lea rsi, [rsi + rax * 2]
286 lea rdi, [rdi + rdx * 2]
; Row 3
287 movq xmm2, QWORD PTR[rsi + rax]
288 movq xmm3, QWORD PTR[rdi + rdx]
289
290 punpcklbw xmm2, xmm0
291 punpcklbw xmm3, xmm0
292
293 psubsw xmm2, xmm3
294 paddw xmm7, xmm2
295
296 pmaddwd xmm2, xmm2
297 paddd xmm1, xmm2
298
; Row 4
299 movq xmm2, QWORD PTR[rsi + rax *2]
300 movq xmm3, QWORD PTR[rdi + rdx *2]
301
302 punpcklbw xmm2, xmm0
303 punpcklbw xmm3, xmm0
304
305 psubsw xmm2, xmm3
306 paddw xmm7, xmm2
307
308 pmaddwd xmm2, xmm2
309 paddd xmm1, xmm2
310
311
; Advance to row 4.
312 lea rsi, [rsi + rax * 2]
313 lea rdi, [rdi + rdx * 2]
314
315
; Row 5
316 movq xmm2, QWORD PTR[rsi + rax]
317 movq xmm3, QWORD PTR[rdi + rdx]
318
319 punpcklbw xmm2, xmm0
320 punpcklbw xmm3, xmm0
321
322 psubsw xmm2, xmm3
323 paddw xmm7, xmm2
324
325 pmaddwd xmm2, xmm2
326 paddd xmm1, xmm2
327
; Row 6
328 movq xmm2, QWORD PTR[rsi + rax *2]
329 movq xmm3, QWORD PTR[rdi + rdx *2]
330
331 punpcklbw xmm2, xmm0
332 punpcklbw xmm3, xmm0
333
334 psubsw xmm2, xmm3
335 paddw xmm7, xmm2
336
337 pmaddwd xmm2, xmm2
338 paddd xmm1, xmm2
339
340
; Advance to row 6.
341 lea rsi, [rsi + rax * 2]
342 lea rdi, [rdi + rdx * 2]
343
; Row 7
344 movq xmm2, QWORD PTR[rsi + rax]
345 movq xmm3, QWORD PTR[rdi + rdx]
346
347 punpcklbw xmm2, xmm0
348 punpcklbw xmm3, xmm0
349
350 psubsw xmm2, xmm3
351 paddw xmm7, xmm2
352
353 pmaddwd xmm2, xmm2
354 paddd xmm1, xmm2
355
356
; Horizontal reduction. The difference sums are interleaved with zero words
; and still added with paddw, so only the low word of each lane is
; meaningful; xmm1 (SSE) is reduced with dword adds alongside.
357 movdqa xmm6, xmm7
358 punpcklwd xmm6, xmm0
359
360 punpckhwd xmm7, xmm0
361 movdqa xmm2, xmm1
362
363 paddw xmm6, xmm7
364 punpckldq xmm1, xmm0
365
366 punpckhdq xmm2, xmm0
367 movdqa xmm7, xmm6
368
369 paddd xmm1, xmm2
370 punpckldq xmm6, xmm0
371
372 punpckhdq xmm7, xmm0
373 paddw xmm6, xmm7
374
375 movdqa xmm2, xmm1
376 movdqa xmm7, xmm6
377
378 psrldq xmm1, 8
379 psrldq xmm6, 8
380
381 paddw xmm7, xmm6
382 paddd xmm1, xmm2
383
384 mov rax, arg(5) ;[Sum]
385 mov rdi, arg(4) ;[SSE]
386
; The total difference is the low 16 bits of xmm7; sign-extend it before
; storing it as a 32-bit int.
387 movq rdx, xmm7
388 movsx rcx, dx
389
390 mov dword ptr [rax], ecx
391 movd DWORD PTR [rdi], xmm1
392
393 ; begin epilog
394 add rsp, 16
395 pop rdi
396 pop rsi
397 RESTORE_GOT
398 RESTORE_XMM
399 UNSHADOW_ARGS
400 pop rbp
401 ret
402
403 ;void vp8_filter_block2d_bil_var_sse2
404 ;(
405 ; unsigned char *ref_ptr,
406 ; int ref_pixels_per_line,
407 ; unsigned char *src_ptr,
408 ; int src_pixels_per_line,
409 ; unsigned int Height,
410 ; int xoffset,
411 ; int yoffset,
412 ; int *sum,
413 ; unsigned int *sumsquared;;
414 ;
415 ;)
; Applies a 2-tap bilinear filter (horizontal tap pair chosen by xoffset,
; vertical by yoffset) to an 8-pixel-wide, Height-row region of ref, subtracts
; the matching src rows, and stores the sum of differences to *sum and the
; sum of squared differences to *sumsquared.
; Four paths are selected from the offsets:
;   xoffset != 0, yoffset != 0 : horizontal then vertical filter (main loop)
;   xoffset == 0, yoffset != 0 : ..._sp_only    (vertical filter only)
;   xoffset == 0, yoffset == 0 : ..._full_pixel (no filtering)
;   xoffset != 0, yoffset == 0 : ..._fp_only    (horizontal filter only)
; Common registers: xmm0 = zero, xmm4 = rounding constant (xmm_bi_rd),
; xmm6 = word difference sums, xmm7 = dword squared-difference sums,
; rsi = ref pointer, rdi = src pointer, rcx = rows left.
; The final reduction uses MMX registers and no emms is issued in this
; function. NOTE(review): presumably callers reset the x87/MMX state - confirm.
416 global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE
417 sym(vp8_filter_block2d_bil_var_sse2):
418 push rbp
419 mov rbp, rsp
420 SHADOW_ARGS_TO_STACK 9
421 SAVE_XMM 7
422 GET_GOT rbx
423 push rsi
424 push rdi
425 push rbx
426 ; end prolog
427
428 pxor xmm6, xmm6 ; xmm6 = sum of differences (words)
429 pxor xmm7, xmm7 ; xmm7 = sum of squared differences (dwords)
430
431 lea rsi, [GLOBAL(xmm_bi_rd)] ; rounding
432 movdqa xmm4, XMMWORD PTR [rsi]
433
; rcx = base of the filter table; each entry is 32 bytes (hence shl 5 below).
434 lea rcx, [GLOBAL(vp8_bilinear_filters_sse2)]
435 movsxd rax, dword ptr arg(5) ; xoffset
436
437 cmp rax, 0 ; skip first_pass filter if xoffset=0
438 je filter_block2d_bil_var_sse2_sp_only
439
440 shl rax, 5 ; point to filter coeff with xoffset
441 lea rax, [rax + rcx] ; HFilter
442
443 movsxd rdx, dword ptr arg(6) ; yoffset
444
445 cmp rdx, 0 ; skip second_pass filter if yoffset=0
446 je filter_block2d_bil_var_sse2_fp_only
447
448 shl rdx, 5
449 lea rdx, [rdx + rcx] ; VFilter
450
451 mov rsi, arg(0) ;ref_ptr
452 mov rdi, arg(2) ;src_ptr
453 movsxd rcx, dword ptr arg(4) ;Height
454
; Horizontally filter the first ref row; the result is kept in xmm5 as the
; "previous row" for the vertical pass.
455 pxor xmm0, xmm0 ; zero for byte->word unpack
456 movq xmm1, QWORD PTR [rsi] ; 8 pixels at x
457 movq xmm3, QWORD PTR [rsi+1] ; 8 pixels at x+1
458
459 punpcklbw xmm1, xmm0 ;
460 pmullw xmm1, [rax] ; * first horizontal tap
461 punpcklbw xmm3, xmm0
462 pmullw xmm3, [rax+16] ; * second horizontal tap
463
464 paddw xmm1, xmm3 ;
465 paddw xmm1, xmm4 ; add rounding
466 psraw xmm1, xmm_filter_shift ;
467 movdqa xmm5, xmm1
468
469 movsxd rbx, dword ptr arg(1) ;ref_pixels_per_line
470 lea rsi, [rsi + rbx]
; On 64-bit builds the src stride is cached in r9; 32-bit builds re-read it
; from the stack inside the loop instead.
471 %if ABI_IS_32BIT=0
472 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
473 %endif
474
475 filter_block2d_bil_var_sse2_loop:
; Horizontal pass on the current ref row.
476 movq xmm1, QWORD PTR [rsi] ;
477 movq xmm3, QWORD PTR [rsi+1] ;
478
479 punpcklbw xmm1, xmm0 ;
480 pmullw xmm1, [rax] ;
481 punpcklbw xmm3, xmm0 ;
482 pmullw xmm3, [rax+16] ;
483
484 paddw xmm1, xmm3 ;
485 paddw xmm1, xmm4 ;
486 psraw xmm1, xmm_filter_shift ;
487
488 movdqa xmm3, xmm5 ; xmm3 = previous filtered row
489 movdqa xmm5, xmm1 ; remember current row for the next iteration
490
; Vertical pass between the previous and current filtered rows.
491 pmullw xmm3, [rdx] ;
492 pmullw xmm1, [rdx+16] ;
493 paddw xmm1, xmm3 ;
494 paddw xmm1, xmm4 ;
495 psraw xmm1, xmm_filter_shift ;
496
497 movq xmm3, QWORD PTR [rdi] ; 8 src pixels
498 punpcklbw xmm3, xmm0 ;
499
500 psubw xmm1, xmm3 ; difference
501 paddw xmm6, xmm1 ; accumulate sum
502
503 pmaddwd xmm1, xmm1 ; square and pair-sum
504 paddd xmm7, xmm1 ; accumulate squared sum
505
506 lea rsi, [rsi + rbx] ;ref_pixels_per_line
507 %if ABI_IS_32BIT
508 add rdi, dword ptr arg(3) ;src_pixels_per_line
509 %else
510 lea rdi, [rdi + r9]
511 %endif
512
513 sub rcx, 1 ;
514 jnz filter_block2d_bil_var_sse2_loop ;
515
516 jmp filter_block2d_bil_variance
517
; Vertical-only path. rcx still holds the filter table base here and is
; consumed before being reloaded with Height; rax becomes the ref stride.
518 filter_block2d_bil_var_sse2_sp_only:
519 movsxd rdx, dword ptr arg(6) ; yoffset
520
521 cmp rdx, 0 ; skip all if both xoffset=0 and yoffset=0
522 je filter_block2d_bil_var_sse2_full_pixel
523
524 shl rdx, 5
525 lea rdx, [rdx + rcx] ; VFilter
526
527 mov rsi, arg(0) ;ref_ptr
528 mov rdi, arg(2) ;src_ptr
529 movsxd rcx, dword ptr arg(4) ;Height
530 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
531
532 pxor xmm0, xmm0 ;
533 movq xmm1, QWORD PTR [rsi] ; first ref row (unfiltered)
534 punpcklbw xmm1, xmm0 ;
535
536 movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
537 lea rsi, [rsi + rax]
538
539 filter_block2d_bil_sp_only_loop:
540 movq xmm3, QWORD PTR [rsi] ; next ref row
541 punpcklbw xmm3, xmm0 ;
542 movdqa xmm5, xmm3 ; keep it as the previous row for next time
543
544 pmullw xmm1, [rdx] ;
545 pmullw xmm3, [rdx+16] ;
546 paddw xmm1, xmm3 ;
547 paddw xmm1, xmm4 ;
548 psraw xmm1, xmm_filter_shift ;
549
550 movq xmm3, QWORD PTR [rdi] ;
551 punpcklbw xmm3, xmm0 ;
552
553 psubw xmm1, xmm3 ;
554 paddw xmm6, xmm1 ;
555
556 pmaddwd xmm1, xmm1 ;
557 paddd xmm7, xmm1 ;
558
559 movdqa xmm1, xmm5 ; previous row <- current row
560 lea rsi, [rsi + rax] ;ref_pixels_per_line
561 lea rdi, [rdi + rbx] ;src_pixels_per_line
562
563 sub rcx, 1 ;
564 jnz filter_block2d_bil_sp_only_loop ;
565
566 jmp filter_block2d_bil_variance
567
; No-filter path: plain difference between ref and src rows.
568 filter_block2d_bil_var_sse2_full_pixel:
569 mov rsi, arg(0) ;ref_ptr
570 mov rdi, arg(2) ;src_ptr
571 movsxd rcx, dword ptr arg(4) ;Height
572 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
573 movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
574 pxor xmm0, xmm0 ;
575
576 filter_block2d_bil_full_pixel_loop:
577 movq xmm1, QWORD PTR [rsi] ;
578 punpcklbw xmm1, xmm0 ;
579
580 movq xmm2, QWORD PTR [rdi] ;
581 punpcklbw xmm2, xmm0 ;
582
583 psubw xmm1, xmm2 ;
584 paddw xmm6, xmm1 ;
585
586 pmaddwd xmm1, xmm1 ;
587 paddd xmm7, xmm1 ;
588
589 lea rsi, [rsi + rax] ;ref_pixels_per_line
590 lea rdi, [rdi + rbx] ;src_pixels_per_line
591
592 sub rcx, 1 ;
593 jnz filter_block2d_bil_full_pixel_loop ;
594
595 jmp filter_block2d_bil_variance
596
; Horizontal-only path. rax still points at the horizontal taps (HFilter);
; rdx is reloaded as the ref stride.
597 filter_block2d_bil_var_sse2_fp_only:
598 mov rsi, arg(0) ;ref_ptr
599 mov rdi, arg(2) ;src_ptr
600 movsxd rcx, dword ptr arg(4) ;Height
601 movsxd rdx, dword ptr arg(1) ;ref_pixels_per_line
602
603 pxor xmm0, xmm0 ;
604 movsxd rbx, dword ptr arg(3) ;src_pixels_per_line
605
606 filter_block2d_bil_fp_only_loop:
607 movq xmm1, QWORD PTR [rsi] ;
608 movq xmm3, QWORD PTR [rsi+1] ;
609
610 punpcklbw xmm1, xmm0 ;
611 pmullw xmm1, [rax] ;
612 punpcklbw xmm3, xmm0 ;
613 pmullw xmm3, [rax+16] ;
614
615 paddw xmm1, xmm3 ;
616 paddw xmm1, xmm4 ;
617 psraw xmm1, xmm_filter_shift ;
618
619 movq xmm3, QWORD PTR [rdi] ;
620 punpcklbw xmm3, xmm0 ;
621
622 psubw xmm1, xmm3 ;
623 paddw xmm6, xmm1 ;
624
625 pmaddwd xmm1, xmm1 ;
626 paddd xmm7, xmm1 ;
627 lea rsi, [rsi + rdx]
628 lea rdi, [rdi + rbx] ;src_pixels_per_line
629
630 sub rcx, 1 ;
631 jnz filter_block2d_bil_fp_only_loop ;
632
; This jump targets the label that immediately follows it.
633 jmp filter_block2d_bil_variance
634
; Shared tail: reduce xmm6 (word sums) and xmm7 (dword sums) to scalars.
635 filter_block2d_bil_variance:
636 movdq2q mm6, xmm6 ; mm6 = low four word sums
637 movdq2q mm7, xmm7 ; mm7 = low two dword squared sums
638
639 psrldq xmm6, 8
640 psrldq xmm7, 8
641
642 movdq2q mm2, xmm6 ; mm2 = high four word sums
643 movdq2q mm3, xmm7 ; mm3 = high two dword squared sums
644
645 paddw mm6, mm2
646 paddd mm7, mm3
647
648 pxor mm3, mm3 ;
649 pxor mm2, mm2 ;
650
; Move each word sum into the high half of a dword so that the psrad below
; sign-extends the final total.
651 punpcklwd mm2, mm6 ;
652 punpckhwd mm3, mm6 ;
653
654 paddd mm2, mm3 ;
655 movq mm6, mm2 ;
656
657 psrlq mm6, 32 ;
658 paddd mm2, mm6 ;
659
660 psrad mm2, 16 ; low dword of mm2 = signed sum of differences
661 movq mm4, mm7 ;
662
663 psrlq mm4, 32 ;
664 paddd mm4, mm7 ; low dword of mm4 = sum of squared differences
665
666 mov rsi, arg(7) ; sum
667 mov rdi, arg(8) ; sumsquared
668
669 movd [rsi], mm2 ; xsum
670 movd [rdi], mm4 ; xxsum
671
672 ; begin epilog
673 pop rbx
674 pop rdi
675 pop rsi
676 RESTORE_GOT
677 RESTORE_XMM
678 UNSHADOW_ARGS
679 pop rbp
680 ret
681
682
683 ;void vp8_half_horiz_vert_variance8x_h_sse2
684 ;(
685 ; unsigned char *ref_ptr,
686 ; int ref_pixels_per_line,
687 ; unsigned char *src_ptr,
688 ; int src_pixels_per_line,
689 ; unsigned int Height,
690 ; int *sum,
691 ; unsigned int *sumsquared
692 ;)
; 8-pixel-wide variance terms against a ref block averaged both horizontally
; (pixel x with x+1) and vertically (row y with y+1) using pavgb.
; Stores the sum of differences to *sum and the sum of squared differences
; to *sumsquared. Reads Height+1 rows of ref.
; Registers: rsi/rdi = ref/src pointers, rcx = rows left, xmm0 = zero,
; xmm5 = horizontally averaged previous row, xmm6 = word difference sums,
; xmm7 = dword squared-difference sums.
; The final reduction uses MMX registers and no emms is issued in this
; function. NOTE(review): presumably callers reset the x87/MMX state - confirm.
693 global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE
694 sym(vp8_half_horiz_vert_variance8x_h_sse2):
695 push rbp
696 mov rbp, rsp
697 SHADOW_ARGS_TO_STACK 7
698 SAVE_XMM 7
699 GET_GOT rbx
700 push rsi
701 push rdi
702 ; end prolog
703
; 64-bit builds cache both strides in r8/r9; 32-bit builds re-read them from
; the stack at each use.
704 %if ABI_IS_32BIT=0
705 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
706 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
707 %endif
708
709 pxor xmm6, xmm6 ; error accumulator
710 pxor xmm7, xmm7 ; sse accumulator
711 mov rsi, arg(0) ;ref_ptr ;
712
713 mov rdi, arg(2) ;src_ptr ;
714 movsxd rcx, dword ptr arg(4) ;Height ;
; NOTE(review): rax is loaded here but not referenced again in this function.
715 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
716
717 pxor xmm0, xmm0 ;
718
719 movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s7
720 movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8
721 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3) horizontal line 1
722
723 %if ABI_IS_32BIT
724 add rsi, dword ptr arg(1) ;ref_pixels_per_line ; next source
725 %else
726 add rsi, r8
727 %endif
728
729 vp8_half_horiz_vert_variance8x_h_1:
730
731 movq xmm1, QWORD PTR [rsi] ;
732 movq xmm2, QWORD PTR [rsi+1] ;
733 pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
734
735 pavgb xmm5, xmm1 ; xmm5 = vertical average of lines i and i+1
736 punpcklbw xmm5, xmm0 ; xmm5 = words of above
737
738 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
739 punpcklbw xmm3, xmm0 ; xmm3 = words of above
740
741 psubw xmm5, xmm3 ; xmm5 -= xmm3
742 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
743 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
744 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
745
746 movdqa xmm5, xmm1 ; save xmm1 for use on the next row
747
748 %if ABI_IS_32BIT
749 add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
750 add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
751 %else
752 add rsi, r8
753 add rdi, r9
754 %endif
755
756 sub rcx, 1 ;
757 jnz vp8_half_horiz_vert_variance8x_h_1 ;
758
; Reduce xmm6 (word sums) and xmm7 (dword sums) to scalars via MMX.
759 movdq2q mm6, xmm6 ; low four word sums
760 movdq2q mm7, xmm7 ; low two dword squared sums
761
762 psrldq xmm6, 8
763 psrldq xmm7, 8
764
765 movdq2q mm2, xmm6
766 movdq2q mm3, xmm7
767
768 paddw mm6, mm2
769 paddd mm7, mm3
770
771 pxor mm3, mm3 ;
772 pxor mm2, mm2 ;
773
; Word sums go into the high half of each dword so psrad can sign-extend.
774 punpcklwd mm2, mm6 ;
775 punpckhwd mm3, mm6 ;
776
777 paddd mm2, mm3 ;
778 movq mm6, mm2 ;
779
780 psrlq mm6, 32 ;
781 paddd mm2, mm6 ;
782
783 psrad mm2, 16 ; low dword = signed sum of differences
784 movq mm4, mm7 ;
785
786 psrlq mm4, 32 ;
787 paddd mm4, mm7 ; low dword = sum of squared differences
788
789 mov rsi, arg(5) ; sum
790 mov rdi, arg(6) ; sumsquared
791
792 movd [rsi], mm2 ;
793 movd [rdi], mm4 ;
794
795
796 ; begin epilog
797 pop rdi
798 pop rsi
799 RESTORE_GOT
800 RESTORE_XMM
801 UNSHADOW_ARGS
802 pop rbp
803 ret
804
805 ;void vp8_half_horiz_vert_variance16x_h_sse2
806 ;(
807 ; unsigned char *ref_ptr,
808 ; int ref_pixels_per_line,
809 ; unsigned char *src_ptr,
810 ; int src_pixels_per_line,
811 ; unsigned int Height,
812 ; int *sum,
813 ; unsigned int *sumsquared
814 ;)
; 16-pixel-wide variance terms against a ref block averaged both horizontally
; (pixel x with x+1) and vertically (row y with y+1) using pavgb.
; Stores the sum of differences to *sum and the sum of squared differences
; to *sumsquared. Reads Height+1 rows of ref with unaligned loads.
; Registers: rsi/rdi = ref/src pointers, rax/rdx = ref/src strides,
; rcx = rows left, xmm0 = zero, xmm5 = horizontally averaged previous row,
; xmm6 = word difference sums, xmm7 = dword squared-difference sums.
815 global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE
816 sym(vp8_half_horiz_vert_variance16x_h_sse2):
817 push rbp
818 mov rbp, rsp
819 SHADOW_ARGS_TO_STACK 7
820 SAVE_XMM 7
821 GET_GOT rbx
822 push rsi
823 push rdi
824 ; end prolog
825
826 pxor xmm6, xmm6 ; error accumulator
827 pxor xmm7, xmm7 ; sse accumulator
828 mov rsi, arg(0) ;ref_ptr ;
829
830 mov rdi, arg(2) ;src_ptr ;
831 movsxd rcx, dword ptr arg(4) ;Height ;
832 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
833 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
834
835 pxor xmm0, xmm0 ;
836
837 movdqu xmm5, XMMWORD PTR [rsi]
838 movdqu xmm3, XMMWORD PTR [rsi+1]
839 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3) horizontal line 1
840
841 lea rsi, [rsi + rax]
842
843 vp8_half_horiz_vert_variance16x_h_1:
844 movdqu xmm1, XMMWORD PTR [rsi] ;
845 movdqu xmm2, XMMWORD PTR [rsi+1] ;
846 pavgb xmm1, xmm2 ; xmm1 = avg(xmm1,xmm2) horizontal line i+1
847
848 pavgb xmm5, xmm1 ; xmm5 = vertical average of lines i and i+1
849
850 movdqa xmm4, xmm5
851 punpcklbw xmm5, xmm0 ; xmm5 = words of low 8 pixels
852 punpckhbw xmm4, xmm0 ; xmm4 = words of high 8 pixels
853
854 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
855 punpcklbw xmm3, xmm0 ; xmm3 = words of above
856 psubw xmm5, xmm3 ; xmm5 -= xmm3
857
858 movq xmm3, QWORD PTR [rdi+8] ; xmm3 = d8..d15
859 punpcklbw xmm3, xmm0
860 psubw xmm4, xmm3
861
862 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
863 paddw xmm6, xmm4
864 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
865 pmaddwd xmm4, xmm4
866 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
867 paddd xmm7, xmm4
868
869 movdqa xmm5, xmm1 ; save xmm1 for use on the next row
870
871 lea rsi, [rsi + rax]
872 lea rdi, [rdi + rdx]
873
874 sub rcx, 1 ;
875 jnz vp8_half_horiz_vert_variance16x_h_1 ;
876
877 pxor xmm1, xmm1
878 pxor xmm5, xmm5
879
; xmm0 is still zero here. Put each word sum in the high half of a dword,
; then psrad 16 to sign-extend it to 32 bits.
880 punpcklwd xmm0, xmm6
881 punpckhwd xmm1, xmm6
882 psrad xmm0, 16
883 psrad xmm1, 16
884 paddd xmm0, xmm1
885 movdqa xmm1, xmm0
886
; Horizontal reduction of the SSE lanes (xmm6/xmm7) and sum lanes (xmm0/xmm1):
; interleave with zero (xmm5), add, then fold the upper qword onto the lower.
887 movdqa xmm6, xmm7
888 punpckldq xmm6, xmm5
889 punpckhdq xmm7, xmm5
890 paddd xmm6, xmm7
891
892 punpckldq xmm0, xmm5
893 punpckhdq xmm1, xmm5
894 paddd xmm0, xmm1
895
896 movdqa xmm7, xmm6
897 movdqa xmm1, xmm0
898
899 psrldq xmm7, 8
900 psrldq xmm1, 8
901
902 paddd xmm6, xmm7
903 paddd xmm0, xmm1
904
905 mov rsi, arg(5) ;[Sum]
906 mov rdi, arg(6) ;[SSE]
907
908 movd [rsi], xmm0
909 movd [rdi], xmm6
910
911 ; begin epilog
912 pop rdi
913 pop rsi
914 RESTORE_GOT
915 RESTORE_XMM
916 UNSHADOW_ARGS
917 pop rbp
918 ret
919
920
921 ;void vp8_half_vert_variance8x_h_sse2
922 ;(
923 ; unsigned char *ref_ptr,
924 ; int ref_pixels_per_line,
925 ; unsigned char *src_ptr,
926 ; int src_pixels_per_line,
927 ; unsigned int Height,
928 ; int *sum,
929 ; unsigned int *sumsquared
930 ;)
; 8-pixel-wide variance terms against a ref block averaged vertically
; (row y with row y+1) using pavgb.
; Stores the sum of differences to *sum and the sum of squared differences
; to *sumsquared. Reads Height+1 rows of ref.
; Registers: rsi/rdi = ref/src pointers, rax = ref stride (for the row-below
; load), rcx = rows left, xmm0 = zero, xmm6 = word difference sums,
; xmm7 = dword squared-difference sums.
; The final reduction uses MMX registers and no emms is issued in this
; function. NOTE(review): presumably callers reset the x87/MMX state - confirm.
931 global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE
932 sym(vp8_half_vert_variance8x_h_sse2):
933 push rbp
934 mov rbp, rsp
935 SHADOW_ARGS_TO_STACK 7
936 SAVE_XMM 7
937 GET_GOT rbx
938 push rsi
939 push rdi
940 ; end prolog
941
; 64-bit builds cache both strides in r8/r9; 32-bit builds re-read them from
; the stack inside the loop.
942 %if ABI_IS_32BIT=0
943 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
944 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
945 %endif
946
947 pxor xmm6, xmm6 ; error accumulator
948 pxor xmm7, xmm7 ; sse accumulator
949 mov rsi, arg(0) ;ref_ptr ;
950
951 mov rdi, arg(2) ;src_ptr ;
952 movsxd rcx, dword ptr arg(4) ;Height ;
953 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
954
955 pxor xmm0, xmm0 ;
956 vp8_half_vert_variance8x_h_1:
957 movq xmm5, QWORD PTR [rsi] ; xmm5 = 8 pixels of ref row i
958 movq xmm3, QWORD PTR [rsi+rax] ; xmm3 = 8 pixels of ref row i+1
959
960 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
961 punpcklbw xmm5, xmm0 ; xmm5 = words of above
962
963 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
964 punpcklbw xmm3, xmm0 ; xmm3 = words of above
965
966 psubw xmm5, xmm3 ; xmm5 -= xmm3
967 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
968 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
969 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
970
971 %if ABI_IS_32BIT
972 add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
973 add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
974 %else
975 add rsi, r8
976 add rdi, r9
977 %endif
978
979 sub rcx, 1 ;
980 jnz vp8_half_vert_variance8x_h_1 ;
981
; Reduce xmm6 (word sums) and xmm7 (dword sums) to scalars via MMX.
982 movdq2q mm6, xmm6 ; low four word sums
983 movdq2q mm7, xmm7 ; low two dword squared sums
984
985 psrldq xmm6, 8
986 psrldq xmm7, 8
987
988 movdq2q mm2, xmm6
989 movdq2q mm3, xmm7
990
991 paddw mm6, mm2
992 paddd mm7, mm3
993
994 pxor mm3, mm3 ;
995 pxor mm2, mm2 ;
996
; Word sums go into the high half of each dword so psrad can sign-extend.
997 punpcklwd mm2, mm6 ;
998 punpckhwd mm3, mm6 ;
999
1000 paddd mm2, mm3 ;
1001 movq mm6, mm2 ;
1002
1003 psrlq mm6, 32 ;
1004 paddd mm2, mm6 ;
1005
1006 psrad mm2, 16 ; low dword = signed sum of differences
1007 movq mm4, mm7 ;
1008
1009 psrlq mm4, 32 ;
1010 paddd mm4, mm7 ; low dword = sum of squared differences
1011
1012 mov rsi, arg(5) ; sum
1013 mov rdi, arg(6) ; sumsquared
1014
1015 movd [rsi], mm2 ;
1016 movd [rdi], mm4 ;
1017
1018
1019 ; begin epilog
1020 pop rdi
1021 pop rsi
1022 RESTORE_GOT
1023 RESTORE_XMM
1024 UNSHADOW_ARGS
1025 pop rbp
1026 ret
1027
1028 ;void vp8_half_vert_variance16x_h_sse2
1029 ;(
1030 ; unsigned char *ref_ptr,
1031 ; int ref_pixels_per_line,
1032 ; unsigned char *src_ptr,
1033 ; int src_pixels_per_line,
1034 ; unsigned int Height,
1035 ; int *sum,
1036 ; unsigned int *sumsquared
1037 ;)
; 16-pixel-wide variance terms against a ref block averaged vertically
; (row y with row y+1) using pavgb.
; Stores the sum of differences to *sum and the sum of squared differences
; to *sumsquared. Reads Height+1 rows of ref with unaligned loads.
; Registers: rsi/rdi = ref/src pointers, rax/rdx = ref/src strides,
; rcx = rows left, xmm0 = zero, xmm5 = previous ref row,
; xmm6 = word difference sums, xmm7 = dword squared-difference sums.
1038 global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE
1039 sym(vp8_half_vert_variance16x_h_sse2):
1040 push rbp
1041 mov rbp, rsp
1042 SHADOW_ARGS_TO_STACK 7
1043 SAVE_XMM 7
1044 GET_GOT rbx
1045 push rsi
1046 push rdi
1047 ; end prolog
1048
1049 pxor xmm6, xmm6 ; error accumulator
1050 pxor xmm7, xmm7 ; sse accumulator
1051 mov rsi, arg(0) ;ref_ptr
1052
1053 mov rdi, arg(2) ;src_ptr
1054 movsxd rcx, dword ptr arg(4) ;Height
1055 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
1056 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
1057
; Preload the first ref row; the loop always loads the row below it.
1058 movdqu xmm5, XMMWORD PTR [rsi]
1059 lea rsi, [rsi + rax ]
1060 pxor xmm0, xmm0
1061
1062 vp8_half_vert_variance16x_h_1:
1063 movdqu xmm3, XMMWORD PTR [rsi]
1064
1065 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
1066 movdqa xmm4, xmm5
1067 punpcklbw xmm5, xmm0
1068 punpckhbw xmm4, xmm0
1069
; Subtract the 16 src pixels, loaded as two 8-byte halves.
1070 movq xmm2, QWORD PTR [rdi]
1071 punpcklbw xmm2, xmm0
1072 psubw xmm5, xmm2
1073 movq xmm2, QWORD PTR [rdi+8]
1074 punpcklbw xmm2, xmm0
1075 psubw xmm4, xmm2
1076
1077 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
1078 paddw xmm6, xmm4
1079 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
1080 pmaddwd xmm4, xmm4
1081 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
1082 paddd xmm7, xmm4
1083
; The row just loaded becomes the previous row of the next iteration.
1084 movdqa xmm5, xmm3
1085
1086 lea rsi, [rsi + rax]
1087 lea rdi, [rdi + rdx]
1088
1089 sub rcx, 1
1090 jnz vp8_half_vert_variance16x_h_1
1091
1092 pxor xmm1, xmm1
1093 pxor xmm5, xmm5
1094
; xmm0 is still zero here. Put each word sum in the high half of a dword,
; then psrad 16 to sign-extend it to 32 bits.
1095 punpcklwd xmm0, xmm6
1096 punpckhwd xmm1, xmm6
1097 psrad xmm0, 16
1098 psrad xmm1, 16
1099 paddd xmm0, xmm1
1100 movdqa xmm1, xmm0
1101
; Horizontal reduction of the SSE lanes (xmm6/xmm7) and sum lanes (xmm0/xmm1).
1102 movdqa xmm6, xmm7
1103 punpckldq xmm6, xmm5
1104 punpckhdq xmm7, xmm5
1105 paddd xmm6, xmm7
1106
1107 punpckldq xmm0, xmm5
1108 punpckhdq xmm1, xmm5
1109 paddd xmm0, xmm1
1110
1111 movdqa xmm7, xmm6
1112 movdqa xmm1, xmm0
1113
1114 psrldq xmm7, 8
1115 psrldq xmm1, 8
1116
1117 paddd xmm6, xmm7
1118 paddd xmm0, xmm1
1119
1120 mov rsi, arg(5) ;[Sum]
1121 mov rdi, arg(6) ;[SSE]
1122
1123 movd [rsi], xmm0
1124 movd [rdi], xmm6
1125
1126 ; begin epilog
1127 pop rdi
1128 pop rsi
1129 RESTORE_GOT
1130 RESTORE_XMM
1131 UNSHADOW_ARGS
1132 pop rbp
1133 ret
1134
1135
1136 ;void vp8_half_horiz_variance8x_h_sse2
1137 ;(
1138 ; unsigned char *ref_ptr,
1139 ; int ref_pixels_per_line,
1140 ; unsigned char *src_ptr,
1141 ; int src_pixels_per_line,
1142 ; unsigned int Height,
1143 ; int *sum,
1144 ; unsigned int *sumsquared
1145 ;)
; 8-pixel-wide variance terms against a ref block averaged horizontally
; (pixel x with pixel x+1) using pavgb.
; Stores the sum of differences to *sum and the sum of squared differences
; to *sumsquared. Reads 9 bytes per ref row.
; Registers: rsi/rdi = ref/src pointers, rcx = rows left, xmm0 = zero,
; xmm6 = word difference sums, xmm7 = dword squared-difference sums.
; The final reduction uses MMX registers and no emms is issued in this
; function. NOTE(review): presumably callers reset the x87/MMX state - confirm.
1146 global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE
1147 sym(vp8_half_horiz_variance8x_h_sse2):
1148 push rbp
1149 mov rbp, rsp
1150 SHADOW_ARGS_TO_STACK 7
1151 SAVE_XMM 7
1152 GET_GOT rbx
1153 push rsi
1154 push rdi
1155 ; end prolog
1156
; 64-bit builds cache both strides in r8/r9; 32-bit builds re-read them from
; the stack inside the loop.
1157 %if ABI_IS_32BIT=0
1158 movsxd r8, dword ptr arg(1) ;ref_pixels_per_line
1159 movsxd r9, dword ptr arg(3) ;src_pixels_per_line
1160 %endif
1161
1162 pxor xmm6, xmm6 ; error accumulator
1163 pxor xmm7, xmm7 ; sse accumulator
1164 mov rsi, arg(0) ;ref_ptr ;
1165
1166 mov rdi, arg(2) ;src_ptr ;
1167 movsxd rcx, dword ptr arg(4) ;Height ;
1168
1169 pxor xmm0, xmm0 ;
1170 vp8_half_horiz_variance8x_h_1:
1171 movq xmm5, QWORD PTR [rsi] ; xmm5 = s0,s1,s2..s7
1172 movq xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8
1173
1174 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
1175 punpcklbw xmm5, xmm0 ; xmm5 = words of above
1176
1177 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
1178 punpcklbw xmm3, xmm0 ; xmm3 = words of above
1179
1180 psubw xmm5, xmm3 ; xmm5 -= xmm3
1181 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
1182 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
1183 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
1184
1185 %if ABI_IS_32BIT
1186 add esi, dword ptr arg(1) ;ref_pixels_per_line ; next source
1187 add edi, dword ptr arg(3) ;src_pixels_per_line ; next destination
1188 %else
1189 add rsi, r8
1190 add rdi, r9
1191 %endif
1192 sub rcx, 1 ;
1193 jnz vp8_half_horiz_variance8x_h_1 ;
1194
; Reduce xmm6 (word sums) and xmm7 (dword sums) to scalars via MMX.
1195 movdq2q mm6, xmm6 ; low four word sums
1196 movdq2q mm7, xmm7 ; low two dword squared sums
1197
1198 psrldq xmm6, 8
1199 psrldq xmm7, 8
1200
1201 movdq2q mm2, xmm6
1202 movdq2q mm3, xmm7
1203
1204 paddw mm6, mm2
1205 paddd mm7, mm3
1206
1207 pxor mm3, mm3 ;
1208 pxor mm2, mm2 ;
1209
; Word sums go into the high half of each dword so psrad can sign-extend.
1210 punpcklwd mm2, mm6 ;
1211 punpckhwd mm3, mm6 ;
1212
1213 paddd mm2, mm3 ;
1214 movq mm6, mm2 ;
1215
1216 psrlq mm6, 32 ;
1217 paddd mm2, mm6 ;
1218
1219 psrad mm2, 16 ; low dword = signed sum of differences
1220 movq mm4, mm7 ;
1221
1222 psrlq mm4, 32 ;
1223 paddd mm4, mm7 ; low dword = sum of squared differences
1224
1225 mov rsi, arg(5) ; sum
1226 mov rdi, arg(6) ; sumsquared
1227
1228 movd [rsi], mm2 ;
1229 movd [rdi], mm4 ;
1230
1231
1232 ; begin epilog
1233 pop rdi
1234 pop rsi
1235 RESTORE_GOT
1236 RESTORE_XMM
1237 UNSHADOW_ARGS
1238 pop rbp
1239 ret
1240
1241 ;void vp8_half_horiz_variance16x_h_sse2
1242 ;(
1243 ; unsigned char *ref_ptr,
1244 ; int ref_pixels_per_line,
1245 ; unsigned char *src_ptr,
1246 ; int src_pixels_per_line,
1247 ; unsigned int Height,
1248 ; int *sum,
1249 ; unsigned int *sumsquared
1250 ;)
; 16-pixel-wide variance terms against a ref block averaged horizontally
; (pixel x with pixel x+1) using pavgb.
; Stores the sum of differences to *sum and the sum of squared differences
; to *sumsquared. Reads 17 bytes per ref row with unaligned loads.
; Registers: rsi/rdi = ref/src pointers, rax/rdx = ref/src strides,
; rcx = rows left, xmm0 = zero, xmm6 = word difference sums,
; xmm7 = dword squared-difference sums.
1251 global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE
1252 sym(vp8_half_horiz_variance16x_h_sse2):
1253 push rbp
1254 mov rbp, rsp
1255 SHADOW_ARGS_TO_STACK 7
1256 SAVE_XMM 7
1257 GET_GOT rbx
1258 push rsi
1259 push rdi
1260 ; end prolog
1261
1262 pxor xmm6, xmm6 ; error accumulator
1263 pxor xmm7, xmm7 ; sse accumulator
1264 mov rsi, arg(0) ;ref_ptr ;
1265
1266 mov rdi, arg(2) ;src_ptr ;
1267 movsxd rcx, dword ptr arg(4) ;Height ;
1268 movsxd rax, dword ptr arg(1) ;ref_pixels_per_line
1269 movsxd rdx, dword ptr arg(3) ;src_pixels_per_line
1270
1271 pxor xmm0, xmm0 ;
1272
1273 vp8_half_horiz_variance16x_h_1:
1274 movdqu xmm5, XMMWORD PTR [rsi] ; xmm5 = s0,s1,s2..s15
1275 movdqu xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16
1276
1277 pavgb xmm5, xmm3 ; xmm5 = avg(xmm5,xmm3)
1278 movdqa xmm1, xmm5
1279 punpcklbw xmm5, xmm0 ; xmm5 = words of low 8 pixels
1280 punpckhbw xmm1, xmm0 ; xmm1 = words of high 8 pixels
1281
1282 movq xmm3, QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
1283 punpcklbw xmm3, xmm0 ; xmm3 = words of above
1284 movq xmm2, QWORD PTR [rdi+8] ; xmm2 = d8..d15
1285 punpcklbw xmm2, xmm0
1286
1287 psubw xmm5, xmm3 ; xmm5 -= xmm3
1288 psubw xmm1, xmm2
1289 paddw xmm6, xmm5 ; xmm6 += accumulated column differences
1290 paddw xmm6, xmm1
1291 pmaddwd xmm5, xmm5 ; xmm5 *= xmm5
1292 pmaddwd xmm1, xmm1
1293 paddd xmm7, xmm5 ; xmm7 += accumulated square column differences
1294 paddd xmm7, xmm1
1295
1296 lea rsi, [rsi + rax]
1297 lea rdi, [rdi + rdx]
1298
1299 sub rcx, 1 ;
1300 jnz vp8_half_horiz_variance16x_h_1 ;
1301
1302 pxor xmm1, xmm1
1303 pxor xmm5, xmm5
1304
; xmm0 is still zero here. Put each word sum in the high half of a dword,
; then psrad 16 to sign-extend it to 32 bits.
1305 punpcklwd xmm0, xmm6
1306 punpckhwd xmm1, xmm6
1307 psrad xmm0, 16
1308 psrad xmm1, 16
1309 paddd xmm0, xmm1
1310 movdqa xmm1, xmm0
1311
; Horizontal reduction of the SSE lanes (xmm6/xmm7) and sum lanes (xmm0/xmm1).
1312 movdqa xmm6, xmm7
1313 punpckldq xmm6, xmm5
1314 punpckhdq xmm7, xmm5
1315 paddd xmm6, xmm7
1316
1317 punpckldq xmm0, xmm5
1318 punpckhdq xmm1, xmm5
1319 paddd xmm0, xmm1
1320
1321 movdqa xmm7, xmm6
1322 movdqa xmm1, xmm0
1323
1324 psrldq xmm7, 8
1325 psrldq xmm1, 8
1326
1327 paddd xmm6, xmm7
1328 paddd xmm0, xmm1
1329
1330 mov rsi, arg(5) ;[Sum]
1331 mov rdi, arg(6) ;[SSE]
1332
1333 movd [rsi], xmm0
1334 movd [rdi], xmm6
1335
1336 ; begin epilog
1337 pop rdi
1338 pop rsi
1339 RESTORE_GOT
1340 RESTORE_XMM
1341 UNSHADOW_ARGS
1342 pop rbp
1343 ret
1344
1345 SECTION_RODATA
1346 ; short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
; Rounding term added before the psraw by xmm_filter_shift in the bilinear
; filter; loaded with movdqa, hence the 16-byte alignment.
1347 align 16
1348 xmm_bi_rd:
1349 times 8 dw 64
; Bilinear tap table: eight 32-byte entries, one per row below.
; Each entry holds the first tap replicated into 8 words, followed by the
; second tap replicated into 8 words; the two taps of every entry sum to 128.
; The filter code selects an entry with (offset << 5) and reads the taps at
; [entry] and [entry+16].
1350 align 16
1351 vp8_bilinear_filters_sse2:
1352 dw 128, 128, 128, 128, 128, 128, 128, 128, 0, 0, 0, 0, 0, 0, 0, 0
1353 dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
1354 dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
1355 dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
1356 dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
1357 dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
1358 dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
1359 dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112

mercurial