media/libvpx/vp9/encoder/x86/vp9_variance_impl_sse2.asm

changeset 0
6474c204b198
equal deleted inserted replaced
-1:000000000000 0:4ae75861f3fc
1 ;
2 ; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
3 ;
4 ; Use of this source code is governed by a BSD-style license
5 ; that can be found in the LICENSE file in the root of the source
6 ; tree. An additional intellectual property rights grant can be found
7 ; in the file PATENTS. All contributing project authors may
8 ; be found in the AUTHORS file in the root of the source tree.
9 ;
10
11
12 %include "vpx_ports/x86_abi_support.asm"
13
;unsigned int vp9_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;
; Returns the sum of squares of the 256 signed 16-bit values starting at
; src_ptr (8 iterations, 64 bytes each).
;
; src_ptr is read with movdqa, so it must be 16-byte aligned.
; Only rax, rcx, xmm0-xmm4 and the flags are modified.
global sym(vp9_get_mb_ss_sse2) PRIVATE
sym(vp9_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    ; end prolog

    mov         rax, arg(0)             ; rax = src_ptr
    mov         rcx, 8                  ; rcx = 64-byte chunks remaining
    pxor        xmm4, xmm4              ; xmm4 = four dword partial sums

.NEXTROW:
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax+16]
    movdqa      xmm2, [rax+32]
    movdqa      xmm3, [rax+48]

    ; pmaddwd x, x squares each word and adds adjacent pairs into dwords
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    pmaddwd     xmm3, xmm3

    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rax, 0x40

    ; sub/jnz depends only on ZF produced by the counter update itself.
    ; (The previous dec/ja pair also tested CF, which dec does not write,
    ; so loop exit relied on the carry left behind by the add above.)
    sub         rcx, 1
    jnz         .NEXTROW

    ; horizontal add of the four dword partial sums
    movdqa      xmm3, xmm4
    psrldq      xmm4, 8
    paddd       xmm4, xmm3
    movdqa      xmm3, xmm4
    psrldq      xmm4, 4
    paddd       xmm4, xmm3

    ; the result is the low dword; the caller reads it as unsigned int
    movq        rax, xmm4

    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
70
71
;unsigned int vp9_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; For a 16x16 block of bytes, writes
;   *Sum = sum of (src - ref)      over all 256 pixels
;   *SSE = sum of (src - ref)^2    over all 256 pixels
; Rows are read with unaligned loads. On return rax holds the Sum pointer.
;
; Register roles inside the loop:
;   rsi/rdi = current src/ref row      rax/rdx = src/ref stride
;   rcx     = rows remaining           xmm0    = zero (for byte->word unpack)
;   xmm7    = 8 word lanes of running differences
;   xmm6    = 4 dword lanes of running squared differences
global sym(vp9_get16x16var_sse2) PRIVATE
sym(vp9_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rax,        DWORD PTR arg(1)    ;[source_stride]
    movsxd      rdx,        DWORD PTR arg(3)    ;[recon_stride]

    mov         rsi,        arg(0)              ;[src_ptr]
    mov         rdi,        arg(2)              ;[ref_ptr]

    ; Warm the cache with the first eight rows of the source block ...
    lea         rcx,        [rax+rax*2]         ; rcx = 3 * source_stride
    lea         rbx,        [rsi+rax*4]         ; rbx = src row 4
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax*2]
    prefetcht0  [rsi+rcx]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax*2]
    prefetcht0  [rbx+rcx]

    ; ... and the first eight rows of the reference block.
    lea         rcx,        [rdx+rdx*2]         ; rcx = 3 * recon_stride
    lea         rbx,        [rdi+rdx*4]         ; rbx = ref row 4
    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx*2]
    prefetcht0  [rdi+rcx]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx*2]
    prefetcht0  [rbx+rcx]

    pxor        xmm0,       xmm0                ; zero, used to widen bytes
    pxor        xmm6,       xmm6                ; squared-difference accumulator
    pxor        xmm7,       xmm7                ; difference accumulator
    mov         rcx,        16                  ; 16 rows

.row_loop:
    movdqu      xmm1,       XMMWORD PTR [rsi]   ; 16 src pixels
    movdqu      xmm2,       XMMWORD PTR [rdi]   ; 16 ref pixels

    ; keep the prefetch window eight rows ahead
    prefetcht0  [rsi+rax*8]
    prefetcht0  [rdi+rdx*8]

    ; widen src to words: xmm1 = pixels 0-7, xmm3 = pixels 8-15
    movdqa      xmm3,       xmm1
    punpcklbw   xmm1,       xmm0
    punpckhbw   xmm3,       xmm0

    ; widen ref to words: xmm2 = pixels 0-7, xmm4 = pixels 8-15
    movdqa      xmm4,       xmm2
    punpcklbw   xmm2,       xmm0
    punpckhbw   xmm4,       xmm0

    psubw       xmm1,       xmm2                ; differences, pixels 0-7
    psubw       xmm3,       xmm4                ; differences, pixels 8-15

    ; Each word lane collects 2 differences per row, 32 in total, each in
    ; [-255, 255], so the 16-bit lanes cannot overflow.
    paddw       xmm7,       xmm1
    paddw       xmm7,       xmm3

    pmaddwd     xmm1,       xmm1                ; squares, pairwise summed
    pmaddwd     xmm3,       xmm3
    paddd       xmm6,       xmm1
    paddd       xmm6,       xmm3

    add         rsi,        rax
    add         rdi,        rdx

    sub         rcx,        1
    jnz         .row_loop

    ; Sign-extend the eight word sums to dwords: place each word in the
    ; upper half of a zeroed dword, then shift right arithmetically.
    pxor        xmm5,       xmm5
    pxor        xmm4,       xmm4
    punpcklwd   xmm5,       xmm7
    punpckhwd   xmm4,       xmm7
    psrad       xmm5,       16
    psrad       xmm4,       16
    paddd       xmm5,       xmm4                ; four dword partial sums

    ; horizontal add -> low dword of xmm5 = Sum
    movdqa      xmm4,       xmm5
    psrldq      xmm4,       8
    paddd       xmm5,       xmm4
    movdqa      xmm4,       xmm5
    psrldq      xmm4,       4
    paddd       xmm5,       xmm4

    ; horizontal add -> low dword of xmm6 = SSE
    movdqa      xmm1,       xmm6
    psrldq      xmm1,       8
    paddd       xmm6,       xmm1
    movdqa      xmm1,       xmm6
    psrldq      xmm1,       4
    paddd       xmm6,       xmm1

    mov         rax,        arg(5)              ;[Sum]
    mov         rdi,        arg(4)              ;[SSE]

    movd DWORD PTR [rax],   xmm5
    movd DWORD PTR [rdi],   xmm6

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
212
213
214
215
;unsigned int vp9_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; For an 8x8 block of bytes, writes
;   *Sum = sum of (src - ref)      over all 64 pixels
;   *SSE = sum of (src - ref)^2    over all 64 pixels
; On return rax holds the Sum pointer.
;
; Register roles:
;   rsi/rdi = current src/ref row      rax/rdx = src/ref stride
;   xmm0    = zero (for byte->word unpack)
;   xmm7    = 8 word lanes of running differences
;   xmm6    = 4 dword lanes of running squared differences
global sym(vp9_get8x8var_sse2) PRIVATE
sym(vp9_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog
    ; (No GOT pointer or stack locals are set up: the routine references
    ;  no global data and keeps everything in registers.)

    mov         rsi,        arg(0)              ;[src_ptr]
    mov         rdi,        arg(2)              ;[ref_ptr]

    movsxd      rax,        DWORD PTR arg(1)    ;[source_stride]
    movsxd      rdx,        DWORD PTR arg(3)    ;[recon_stride]

    pxor        xmm0,       xmm0                ; zero, used to widen bytes
    pxor        xmm7,       xmm7                ; difference accumulator
    pxor        xmm6,       xmm6                ; squared-difference accumulator

    ; One copy of the row body per row; fully unrolled at assembly time.
%rep 8
    movq        xmm1,       QWORD PTR [rsi]     ; 8 src pixels
    movq        xmm2,       QWORD PTR [rdi]     ; 8 ref pixels

    punpcklbw   xmm1,       xmm0                ; widen to words
    punpcklbw   xmm2,       xmm0

    ; operands are in [0, 255], so the saturating subtract never saturates
    psubsw      xmm1,       xmm2
    paddw       xmm7,       xmm1                ; 8 diffs per lane in total:
                                                ; |lane| <= 8 * 255, no overflow

    pmaddwd     xmm1,       xmm1                ; squares, pairwise summed
    paddd       xmm6,       xmm1

    add         rsi,        rax                 ; next src row
    add         rdi,        rdx                 ; next ref row
%endrep

    ; Sign-extend the eight word sums to dwords: place each word in the
    ; upper half of a zeroed dword, then shift right arithmetically.
    pxor        xmm5,       xmm5
    pxor        xmm4,       xmm4
    punpcklwd   xmm5,       xmm7
    punpckhwd   xmm4,       xmm7
    psrad       xmm5,       16
    psrad       xmm4,       16
    paddd       xmm5,       xmm4                ; four dword partial sums

    ; horizontal add -> low dword of xmm5 = Sum
    movdqa      xmm4,       xmm5
    psrldq      xmm4,       8
    paddd       xmm5,       xmm4
    movdqa      xmm4,       xmm5
    psrldq      xmm4,       4
    paddd       xmm5,       xmm4

    ; horizontal add -> low dword of xmm6 = SSE
    movdqa      xmm1,       xmm6
    psrldq      xmm1,       8
    paddd       xmm6,       xmm1
    movdqa      xmm1,       xmm6
    psrldq      xmm1,       4
    paddd       xmm6,       xmm1

    mov         rax,        arg(5)              ;[Sum]
    mov         rdi,        arg(4)              ;[SSE]

    movd DWORD PTR [rax],   xmm5
    movd DWORD PTR [rdi],   xmm6

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
400
;void vp9_half_horiz_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For an 8-pixel-wide, Height-row block: each reference row is averaged
; with itself shifted by one pixel (pavgb), that result is averaged with
; the same quantity from the following row, and the prediction is compared
; with src_ptr.  Writes
;   *sum        = sum of (pred - src)
;   *sumsquared = sum of (pred - src)^2
;
; Reads Height + 1 reference rows of 9 bytes each.
; The differences are accumulated in 16-bit lanes and the final total is
; sign-extended from 16 bits.
; NOTE(review): presumably callers keep Height small enough (<= 16) that
; this cannot wrap -- confirm against callers.
; NOTE(review): Height is assumed to be non-zero; the loop body runs before
; the counter is tested.
;
; The final reduction uses XMM registers only, so no MMX state is left
; behind and no EMMS is required after this call.
global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    pxor        xmm6,       xmm6        ; difference accumulator (8 words)
    pxor        xmm7,       xmm7        ; squared-difference accumulator (4 dwords)
    pxor        xmm0,       xmm0        ; zero, used to widen bytes

    mov         rsi,        arg(0)      ;ref_ptr
    mov         rdi,        arg(2)      ;src_ptr
    movsxd      rcx,        dword ptr arg(4)    ;Height

    ; horizontal average of the first reference row
    movq        xmm5,       QWORD PTR [rsi]     ; xmm5 = r0..r7
    movq        xmm3,       QWORD PTR [rsi+1]   ; xmm3 = r1..r8
    pavgb       xmm5,       xmm3                ; xmm5 = avg(xmm5, xmm3)

%if ABI_IS_32BIT
    add         rsi,        dword ptr arg(1)    ;ref_pixels_per_line ; next reference row
%else
    add         rsi,        r8
%endif

.half_horiz_vert_variance8x_h_1:
    ; horizontal average of reference row i+1
    movq        xmm1,       QWORD PTR [rsi]
    movq        xmm2,       QWORD PTR [rsi+1]
    pavgb       xmm1,       xmm2                ; xmm1 = avg(xmm1, xmm2)

    pavgb       xmm5,       xmm1                ; vertical average of rows i and i+1
    punpcklbw   xmm5,       xmm0                ; widen prediction to words

    movq        xmm3,       QWORD PTR [rdi]     ; xmm3 = s0..s7
    punpcklbw   xmm3,       xmm0                ; widen source to words

    psubw       xmm5,       xmm3                ; xmm5 = pred - src
    paddw       xmm6,       xmm5                ; accumulate differences
    pmaddwd     xmm5,       xmm5                ; squares, pairwise summed
    paddd       xmm7,       xmm5                ; accumulate squared differences

    movdqa      xmm5,       xmm1                ; row i+1 average becomes "row i" next time

%if ABI_IS_32BIT
    add         esi,        dword ptr arg(1)    ;ref_pixels_per_line ; next reference row
    add         edi,        dword ptr arg(3)    ;src_pixels_per_line ; next source row
%else
    add         rsi,        r8
    add         rdi,        r9
%endif

    sub         rcx,        1
    jnz         .half_horiz_vert_variance8x_h_1

    ; ---- reduce xmm6 (8 words) to one sign-extended dword in xmm1 ----
    movdqa      xmm1,       xmm6
    psrldq      xmm1,       8
    paddw       xmm6,       xmm1                ; low 4 words = w[i] + w[i+4]

    pxor        xmm1,       xmm1
    punpcklwd   xmm1,       xmm6                ; each word moved to the top of a dword

    movdqa      xmm2,       xmm1
    psrldq      xmm2,       8
    paddd       xmm1,       xmm2
    movdqa      xmm2,       xmm1
    psrldq      xmm2,       4
    paddd       xmm1,       xmm2                ; low dword = total << 16
    psrad       xmm1,       16                  ; sign-extend the 16-bit total

    ; ---- reduce xmm7 (4 dwords) to one dword ----
    movdqa      xmm2,       xmm7
    psrldq      xmm2,       8
    paddd       xmm7,       xmm2
    movdqa      xmm2,       xmm7
    psrldq      xmm2,       4
    paddd       xmm7,       xmm2

    mov         rsi,        arg(5)      ; sum
    mov         rdi,        arg(6)      ; sumsquared

    movd DWORD PTR [rsi],   xmm1
    movd DWORD PTR [rdi],   xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
522
;void vp9_half_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For an 8-pixel-wide, Height-row block: each reference row is averaged
; (pavgb) with the row below it and the prediction is compared with
; src_ptr.  Writes
;   *sum        = sum of (pred - src)
;   *sumsquared = sum of (pred - src)^2
;
; Reads Height + 1 reference rows of 8 bytes each.
; The differences are accumulated in 16-bit lanes and the final total is
; sign-extended from 16 bits.
; NOTE(review): presumably callers keep Height small enough (<= 16) that
; this cannot wrap -- confirm against callers.
; NOTE(review): Height is assumed to be non-zero; the loop body runs before
; the counter is tested.
;
; The final reduction uses XMM registers only, so no MMX state is left
; behind and no EMMS is required after this call.
global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    pxor        xmm6,       xmm6        ; difference accumulator (8 words)
    pxor        xmm7,       xmm7        ; squared-difference accumulator (4 dwords)
    pxor        xmm0,       xmm0        ; zero, used to widen bytes

    mov         rsi,        arg(0)      ;ref_ptr
    mov         rdi,        arg(2)      ;src_ptr
    movsxd      rcx,        dword ptr arg(4)    ;Height
    movsxd      rax,        dword ptr arg(1)    ;ref_pixels_per_line (offset to the row below)

.half_vert_variance8x_h_1:
    movq        xmm5,       QWORD PTR [rsi]     ; xmm5 = reference row i
    movq        xmm3,       QWORD PTR [rsi+rax] ; xmm3 = reference row i+1

    pavgb       xmm5,       xmm3                ; xmm5 = avg(xmm5, xmm3)
    punpcklbw   xmm5,       xmm0                ; widen prediction to words

    movq        xmm3,       QWORD PTR [rdi]     ; xmm3 = s0..s7
    punpcklbw   xmm3,       xmm0                ; widen source to words

    psubw       xmm5,       xmm3                ; xmm5 = pred - src
    paddw       xmm6,       xmm5                ; accumulate differences
    pmaddwd     xmm5,       xmm5                ; squares, pairwise summed
    paddd       xmm7,       xmm5                ; accumulate squared differences

%if ABI_IS_32BIT
    add         esi,        dword ptr arg(1)    ;ref_pixels_per_line ; next reference row
    add         edi,        dword ptr arg(3)    ;src_pixels_per_line ; next source row
%else
    add         rsi,        r8
    add         rdi,        r9
%endif

    sub         rcx,        1
    jnz         .half_vert_variance8x_h_1

    ; ---- reduce xmm6 (8 words) to one sign-extended dword in xmm1 ----
    movdqa      xmm1,       xmm6
    psrldq      xmm1,       8
    paddw       xmm6,       xmm1                ; low 4 words = w[i] + w[i+4]

    pxor        xmm1,       xmm1
    punpcklwd   xmm1,       xmm6                ; each word moved to the top of a dword

    movdqa      xmm2,       xmm1
    psrldq      xmm2,       8
    paddd       xmm1,       xmm2
    movdqa      xmm2,       xmm1
    psrldq      xmm2,       4
    paddd       xmm1,       xmm2                ; low dword = total << 16
    psrad       xmm1,       16                  ; sign-extend the 16-bit total

    ; ---- reduce xmm7 (4 dwords) to one dword ----
    movdqa      xmm2,       xmm7
    psrldq      xmm2,       8
    paddd       xmm7,       xmm2
    movdqa      xmm2,       xmm7
    psrldq      xmm2,       4
    paddd       xmm7,       xmm2

    mov         rsi,        arg(5)      ; sum
    mov         rdi,        arg(6)      ; sumsquared

    movd DWORD PTR [rsi],   xmm1
    movd DWORD PTR [rdi],   xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
629
630
;void vp9_half_horiz_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; For an 8-pixel-wide, Height-row block: each reference row is averaged
; (pavgb) with itself shifted by one pixel and the prediction is compared
; with src_ptr.  Writes
;   *sum        = sum of (pred - src)
;   *sumsquared = sum of (pred - src)^2
;
; Reads Height reference rows of 9 bytes each.
; The differences are accumulated in 16-bit lanes and the final total is
; sign-extended from 16 bits.
; NOTE(review): presumably callers keep Height small enough (<= 16) that
; this cannot wrap -- confirm against callers.
; NOTE(review): Height is assumed to be non-zero; the loop body runs before
; the counter is tested.
;
; The final reduction uses XMM registers only, so no MMX state is left
; behind and no EMMS is required after this call.
global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    pxor        xmm6,       xmm6        ; difference accumulator (8 words)
    pxor        xmm7,       xmm7        ; squared-difference accumulator (4 dwords)
    pxor        xmm0,       xmm0        ; zero, used to widen bytes

    mov         rsi,        arg(0)      ;ref_ptr
    mov         rdi,        arg(2)      ;src_ptr
    movsxd      rcx,        dword ptr arg(4)    ;Height

.half_horiz_variance8x_h_1:
    movq        xmm5,       QWORD PTR [rsi]     ; xmm5 = r0..r7
    movq        xmm3,       QWORD PTR [rsi+1]   ; xmm3 = r1..r8

    pavgb       xmm5,       xmm3                ; xmm5 = avg(xmm5, xmm3)
    punpcklbw   xmm5,       xmm0                ; widen prediction to words

    movq        xmm3,       QWORD PTR [rdi]     ; xmm3 = s0..s7
    punpcklbw   xmm3,       xmm0                ; widen source to words

    psubw       xmm5,       xmm3                ; xmm5 = pred - src
    paddw       xmm6,       xmm5                ; accumulate differences
    pmaddwd     xmm5,       xmm5                ; squares, pairwise summed
    paddd       xmm7,       xmm5                ; accumulate squared differences

%if ABI_IS_32BIT
    add         esi,        dword ptr arg(1)    ;ref_pixels_per_line ; next reference row
    add         edi,        dword ptr arg(3)    ;src_pixels_per_line ; next source row
%else
    add         rsi,        r8
    add         rdi,        r9
%endif

    sub         rcx,        1
    jnz         .half_horiz_variance8x_h_1

    ; ---- reduce xmm6 (8 words) to one sign-extended dword in xmm1 ----
    movdqa      xmm1,       xmm6
    psrldq      xmm1,       8
    paddw       xmm6,       xmm1                ; low 4 words = w[i] + w[i+4]

    pxor        xmm1,       xmm1
    punpcklwd   xmm1,       xmm6                ; each word moved to the top of a dword

    movdqa      xmm2,       xmm1
    psrldq      xmm2,       8
    paddd       xmm1,       xmm2
    movdqa      xmm2,       xmm1
    psrldq      xmm2,       4
    paddd       xmm1,       xmm2                ; low dword = total << 16
    psrad       xmm1,       16                  ; sign-extend the 16-bit total

    ; ---- reduce xmm7 (4 dwords) to one dword ----
    movdqa      xmm2,       xmm7
    psrldq      xmm2,       8
    paddd       xmm7,       xmm2
    movdqa      xmm2,       xmm7
    psrldq      xmm2,       4
    paddd       xmm7,       xmm2

    mov         rsi,        arg(5)      ; sum
    mov         rdi,        arg(6)      ; sumsquared

    movd DWORD PTR [rsi],   xmm1
    movd DWORD PTR [rdi],   xmm7

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

mercurial