;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;unsigned int vp9_get_mb_ss_sse2
;(
;    short *src_ptr
;)
global sym(vp9_get_mb_ss_sse2) PRIVATE
sym(vp9_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rax, arg(0)                 ;[src_ptr]
    mov         rcx, 8
    pxor        xmm4, xmm4

.NEXTROW:
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax+16]
    movdqa      xmm2, [rax+32]
    movdqa      xmm3, [rax+48]
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    pmaddwd     xmm3, xmm3

    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rax, 0x40
    dec         rcx
    ja          .NEXTROW

    movdqa      xmm3, xmm4
    psrldq      xmm4, 8
    paddd       xmm4, xmm3
    movdqa      xmm3, xmm4
    psrldq      xmm4, 4
    paddd       xmm4, xmm3
    movq        rax, xmm4

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

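; For reference, a plain-C model of vp9_get_mb_ss_sse2 (added commentary; the
; helper name is illustrative, not part of the original source). The loop
; above squares and accumulates 32 coefficients (64 bytes) per iteration, so
; 8 iterations cover all 256 shorts of one macroblock, and the trailing
; psrldq/paddd pairs fold the four dword partial sums into a single scalar.
;
;   unsigned int get_mb_ss_c(const short *src_ptr) {
;     unsigned int i, sum = 0;
;     for (i = 0; i < 256; i++)
;       sum += (unsigned int)(src_ptr[i] * src_ptr[i]);
;     return sum;
;   }
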
;unsigned int vp9_get16x16var_sse2
;(
;    unsigned char *src_ptr,
;    int source_stride,
;    unsigned char *ref_ptr,
;    int recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
global sym(vp9_get16x16var_sse2) PRIVATE
sym(vp9_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ;[src_ptr]
    mov         rdi, arg(2)                 ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)       ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)       ;[recon_stride]

    ; Prefetch data
    lea         rcx, [rax+rax*2]
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax*2]
    prefetcht0  [rsi+rcx]
    lea         rbx, [rsi+rax*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax*2]
    prefetcht0  [rbx+rcx]

    lea         rcx, [rdx+rdx*2]
    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx*2]
    prefetcht0  [rdi+rcx]
    lea         rbx, [rdi+rdx*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx*2]
    prefetcht0  [rbx+rcx]

    pxor        xmm0, xmm0                  ; clear xmm0 for unpack
    pxor        xmm7, xmm7                  ; clear xmm7 for accumulating diffs

    pxor        xmm6, xmm6                  ; clear xmm6 for accumulating sse
    mov         rcx, 16

.var16loop:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rdi]

    prefetcht0  [rsi+rax*8]
    prefetcht0  [rdi+rdx*8]

    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm2

    punpcklbw   xmm1, xmm0
    punpckhbw   xmm3, xmm0

    punpcklbw   xmm2, xmm0
    punpckhbw   xmm4, xmm0

    psubw       xmm1, xmm2
    psubw       xmm3, xmm4

    paddw       xmm7, xmm1
    pmaddwd     xmm1, xmm1

    paddw       xmm7, xmm3
    pmaddwd     xmm3, xmm3

    paddd       xmm6, xmm1
    paddd       xmm6, xmm3

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         .var16loop

    movdqa      xmm1, xmm6
    pxor        xmm6, xmm6

    pxor        xmm5, xmm5
    punpcklwd   xmm6, xmm7

    punpckhwd   xmm5, xmm7
    psrad       xmm5, 16

    psrad       xmm6, 16
    paddd       xmm6, xmm5

    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddd       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddd       xmm7, xmm6
    paddd       xmm1, xmm2

    mov         rax, arg(5)                 ;[Sum]
    mov         rdi, arg(4)                 ;[SSE]

    movd        DWORD PTR [rax], xmm7
    movd        DWORD PTR [rdi], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

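; A C model of the contract implemented above (added commentary; the helper
; name is illustrative). Note the 16-bit difference accumulator (paddw xmm7)
; cannot overflow: each word lane gathers two columns over 16 rows, i.e. 32
; diffs of magnitude at most 255, so |lane| <= 8160, and the lanes are then
; sign-extended (punpcklwd/punpckhwd against zero plus psrad 16) before the
; final dword sum.
;
;   void get16x16var_c(const unsigned char *src_ptr, int source_stride,
;                      const unsigned char *ref_ptr, int recon_stride,
;                      unsigned int *SSE, int *Sum) {
;     int i, j, diff, sum = 0;
;     unsigned int sse = 0;
;     for (i = 0; i < 16; i++) {
;       for (j = 0; j < 16; j++) {
;         diff = src_ptr[j] - ref_ptr[j];
;         sum += diff;
;         sse += (unsigned int)(diff * diff);
;       }
;       src_ptr += source_stride;
;       ref_ptr += recon_stride;
;     }
;     *SSE = sse;
;     *Sum = sum;
;   }
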
;unsigned int vp9_get8x8var_sse2
;(
;    unsigned char *src_ptr,
;    int source_stride,
;    unsigned char *ref_ptr,
;    int recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
global sym(vp9_get8x8var_sse2) PRIVATE
sym(vp9_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rsi, arg(0)                 ;[src_ptr]
    mov         rdi, arg(2)                 ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)       ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)       ;[recon_stride]

    pxor        xmm0, xmm0                  ; clear xmm0 for unpack
    pxor        xmm7, xmm7                  ; clear xmm7 for accumulating diffs

    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rdi]

    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0

    psubsw      xmm1, xmm2
    paddw       xmm7, xmm1

    pmaddwd     xmm1, xmm1

    movq        xmm2, QWORD PTR [rsi + rax]
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    movq        xmm2, QWORD PTR [rsi + rax * 2]
    movq        xmm3, QWORD PTR [rdi + rdx * 2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]
    movq        xmm2, QWORD PTR [rsi + rax]
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    movq        xmm2, QWORD PTR [rsi + rax * 2]
    movq        xmm3, QWORD PTR [rdi + rdx * 2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    movq        xmm2, QWORD PTR [rsi + rax]
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    movq        xmm2, QWORD PTR [rsi + rax * 2]
    movq        xmm3, QWORD PTR [rdi + rdx * 2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    movq        xmm2, QWORD PTR [rsi + rax]
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    movdqa      xmm6, xmm7
    punpcklwd   xmm6, xmm0

    punpckhwd   xmm7, xmm0
    movdqa      xmm2, xmm1

    paddw       xmm6, xmm7
    punpckldq   xmm1, xmm0

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddw       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddw       xmm7, xmm6
    paddd       xmm1, xmm2

    mov         rax, arg(5)                 ;[Sum]
    mov         rdi, arg(4)                 ;[SSE]

    movq        rdx, xmm7
    movsx       rcx, dx

    mov         dword ptr [rax], ecx
    movd        DWORD PTR [rdi], xmm1

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

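; Note on the scalar extraction above (added commentary): the sum reduction
; uses word adds (paddw), which wrap modulo 2^16, but the true 8x8 sum of
; differences is bounded by 64 * 255 = 16320 in magnitude, so it fits a
; signed word and movsx rcx, dx recovers it exactly. A hypothetical caller
; would combine the two outputs into a variance along these lines:
;
;   unsigned int variance8x8(const unsigned char *src, int sstride,
;                            const unsigned char *ref, int rstride) {
;     unsigned int sse;
;     int sum;
;     vp9_get8x8var_sse2(src, sstride, ref, rstride, &sse, &sum);
;     return sse - (unsigned int)((sum * sum) >> 6);   /* 6 = log2(8*8) */
;   }
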
;void vp9_half_horiz_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)        ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)        ;src_pixels_per_line
%endif

    pxor        xmm6, xmm6                  ; error accumulator
    pxor        xmm7, xmm7                  ; sse accumulator
    mov         rsi, arg(0)                 ;ref_ptr

    mov         rdi, arg(2)                 ;src_ptr
    movsxd      rcx, dword ptr arg(4)       ;Height
    movsxd      rax, dword ptr arg(1)       ;ref_pixels_per_line

    pxor        xmm0, xmm0

    movq        xmm5, QWORD PTR [rsi]       ; xmm5 = s0,s1,s2..s8
    movq        xmm3, QWORD PTR [rsi+1]     ; xmm3 = s1,s2,s3..s9
    pavgb       xmm5, xmm3                  ; xmm5 = avg(xmm5,xmm3) horizontal line 1

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)       ;ref_pixels_per_line ; next source
%else
    add         rsi, r8
%endif

.half_horiz_vert_variance8x_h_1:
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rsi+1]
    pavgb       xmm1, xmm2                  ; xmm1 = avg(xmm1,xmm2) horizontal line i+1

    pavgb       xmm5, xmm1                  ; xmm5 = vertical average of the above
    punpcklbw   xmm5, xmm0                  ; xmm5 = words of above

    movq        xmm3, QWORD PTR [rdi]       ; xmm3 = d0,d1,d2..d8
    punpcklbw   xmm3, xmm0                  ; xmm3 = words of above

    psubw       xmm5, xmm3                  ; xmm5 -= xmm3
    paddw       xmm6, xmm5                  ; xmm6 += accumulated column differences
    pmaddwd     xmm5, xmm5                  ; xmm5 *= xmm5
    paddd       xmm7, xmm5                  ; xmm7 += accumulated square column differences

    movdqa      xmm5, xmm1                  ; save xmm1 for use on the next row

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)       ;ref_pixels_per_line ; next source
    add         edi, dword ptr arg(3)       ;src_pixels_per_line ; next destination
%else
    add         rsi, r8
    add         rdi, r9
%endif

    sub         rcx, 1
    jnz         .half_horiz_vert_variance8x_h_1

    movdq2q     mm6, xmm6
    movdq2q     mm7, xmm7

    psrldq      xmm6, 8
    psrldq      xmm7, 8

    movdq2q     mm2, xmm6
    movdq2q     mm3, xmm7

    paddw       mm6, mm2
    paddd       mm7, mm3

    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7

    mov         rsi, arg(5)                 ; sum
    mov         rdi, arg(6)                 ; sumsquared

    movd        [rsi], mm2
    movd        [rdi], mm4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

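; The kernel above measures the residual at the half-pel position between
; four reference pixels using two rounding byte averages (pavgb computes
; (a + b + 1) >> 1). A C model of one pixel (added commentary; the cascaded
; averages can differ slightly from a single (a+b+c+d+2)>>2 filter because
; of intermediate rounding):
;
;   h0   = (ref[i][j]   + ref[i][j+1]   + 1) >> 1;  /* pavgb, row i    */
;   h1   = (ref[i+1][j] + ref[i+1][j+1] + 1) >> 1;  /* pavgb, row i+1  */
;   p    = (h0 + h1 + 1) >> 1;                      /* pavgb, vertical */
;   diff = p - src[i][j];
;   sum += diff;  sse += diff * diff;
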
;void vp9_half_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)        ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)        ;src_pixels_per_line
%endif

    pxor        xmm6, xmm6                  ; error accumulator
    pxor        xmm7, xmm7                  ; sse accumulator
    mov         rsi, arg(0)                 ;ref_ptr

    mov         rdi, arg(2)                 ;src_ptr
    movsxd      rcx, dword ptr arg(4)       ;Height
    movsxd      rax, dword ptr arg(1)       ;ref_pixels_per_line

    pxor        xmm0, xmm0
.half_vert_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]       ; xmm5 = s0,s1,s2..s8
    movq        xmm3, QWORD PTR [rsi+rax]   ; xmm3 = same columns, next row

    pavgb       xmm5, xmm3                  ; xmm5 = avg(xmm5,xmm3)
    punpcklbw   xmm5, xmm0                  ; xmm5 = words of above

    movq        xmm3, QWORD PTR [rdi]       ; xmm3 = d0,d1,d2..d8
    punpcklbw   xmm3, xmm0                  ; xmm3 = words of above

    psubw       xmm5, xmm3                  ; xmm5 -= xmm3
    paddw       xmm6, xmm5                  ; xmm6 += accumulated column differences
    pmaddwd     xmm5, xmm5                  ; xmm5 *= xmm5
    paddd       xmm7, xmm5                  ; xmm7 += accumulated square column differences

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)       ;ref_pixels_per_line ; next source
    add         edi, dword ptr arg(3)       ;src_pixels_per_line ; next destination
%else
    add         rsi, r8
    add         rdi, r9
%endif

    sub         rcx, 1
    jnz         .half_vert_variance8x_h_1

    movdq2q     mm6, xmm6
    movdq2q     mm7, xmm7

    psrldq      xmm6, 8
    psrldq      xmm7, 8

    movdq2q     mm2, xmm6
    movdq2q     mm3, xmm7

    paddw       mm6, mm2
    paddd       mm7, mm3

    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7

    mov         rsi, arg(5)                 ; sum
    mov         rdi, arg(6)                 ; sumsquared

    movd        [rsi], mm2
    movd        [rdi], mm4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

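; Reduction note for the three half-pel kernels (added commentary): the
; signed word sums are widened without a dedicated sign-extend instruction.
; punpcklwd/punpckhwd against a zeroed register place each 16-bit lane in
; the top half of a dword (value s << 16); those can be summed with plain
; dword adds, and a single psrad by 16 at the end shifts the total back
; down with the sign preserved. For example the lane value 0xFFFE (-2)
; becomes 0xFFFE0000, and psrad 16 yields 0xFFFFFFFE, i.e. -2 again.
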
;void vp9_half_horiz_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)        ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)        ;src_pixels_per_line
%endif

    pxor        xmm6, xmm6                  ; error accumulator
    pxor        xmm7, xmm7                  ; sse accumulator
    mov         rsi, arg(0)                 ;ref_ptr

    mov         rdi, arg(2)                 ;src_ptr
    movsxd      rcx, dword ptr arg(4)       ;Height

    pxor        xmm0, xmm0
.half_horiz_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]       ; xmm5 = s0,s1,s2..s8
    movq        xmm3, QWORD PTR [rsi+1]     ; xmm3 = s1,s2,s3..s9

    pavgb       xmm5, xmm3                  ; xmm5 = avg(xmm5,xmm3)
    punpcklbw   xmm5, xmm0                  ; xmm5 = words of above

    movq        xmm3, QWORD PTR [rdi]       ; xmm3 = d0,d1,d2..d8
    punpcklbw   xmm3, xmm0                  ; xmm3 = words of above

    psubw       xmm5, xmm3                  ; xmm5 -= xmm3
    paddw       xmm6, xmm5                  ; xmm6 += accumulated column differences
    pmaddwd     xmm5, xmm5                  ; xmm5 *= xmm5
    paddd       xmm7, xmm5                  ; xmm7 += accumulated square column differences

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)       ;ref_pixels_per_line ; next source
    add         edi, dword ptr arg(3)       ;src_pixels_per_line ; next destination
%else
    add         rsi, r8
    add         rdi, r9
%endif
    sub         rcx, 1
    jnz         .half_horiz_variance8x_h_1

    movdq2q     mm6, xmm6
    movdq2q     mm7, xmm7

    psrldq      xmm6, 8
    psrldq      xmm7, 8

    movdq2q     mm2, xmm6
    movdq2q     mm3, xmm7

    paddw       mm6, mm2
    paddd       mm7, mm3

    pxor        mm3, mm3
    pxor        mm2, mm2

    punpcklwd   mm2, mm6
    punpckhwd   mm3, mm6

    paddd       mm2, mm3
    movq        mm6, mm2

    psrlq       mm6, 32
    paddd       mm2, mm6

    psrad       mm2, 16
    movq        mm4, mm7

    psrlq       mm4, 32
    paddd       mm4, mm7

    mov         rsi, arg(5)                 ; sum
    mov         rdi, arg(6)                 ; sumsquared

    movd        [rsi], mm2
    movd        [rdi], mm4

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
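; All three half-pel kernels share one output contract: *sum receives the
; signed sum of (filtered ref - src) differences and *sumsquared their sum
; of squares over Height rows of 8 pixels. A C model of the horizontal case
; above (added commentary; the helper name is illustrative):
;
;   void half_horiz_variance8x_h_c(const unsigned char *ref_ptr,
;                                  int ref_pixels_per_line,
;                                  const unsigned char *src_ptr,
;                                  int src_pixels_per_line,
;                                  unsigned int Height,
;                                  int *sum, unsigned int *sumsquared) {
;     unsigned int i, j, sse = 0;
;     int s = 0, diff;
;     for (i = 0; i < Height; i++) {
;       for (j = 0; j < 8; j++) {
;         diff = ((ref_ptr[j] + ref_ptr[j + 1] + 1) >> 1) - src_ptr[j];
;         s += diff;
;         sse += (unsigned int)(diff * diff);
;       }
;       ref_ptr += ref_pixels_per_line;
;       src_ptr += src_pixels_per_line;
;     }
;     *sum = s;
;     *sumsquared = sse;
;   }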