;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Half-pixel variance kernels for 16-pixel-wide blocks (SSE2).
;
; Syntax: Intel operand order (dst, src), NASM/yasm style. Argument access,
; prologue and epilogue go through the macros pulled in by the %include below
; (sym, arg, SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, ...); their exact
; expansion is not visible in this file.
;
; All three functions share one contract:
;   * For each of Height rows, a 16-byte prediction is built from ref_ptr with
;     pavgb (byte average, (a + b + 1) >> 1) and src_ptr's 16 bytes are
;     subtracted from it.
;   * *sum        receives the 32-bit total of the differences.
;   * *sumsquared receives the 32-bit total of the squared differences.
;   * Height must be non-zero: the row loop decrements and then tests, so a
;     zero count would wrap instead of terminating.
;   * Differences are accumulated in eight 16-bit lanes (paddw), two
;     differences per lane per row.
;     NOTE(review): that bounds the safe Height (roughly 64 rows for worst-case
;     input) - confirm callers stay within it.

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; REDUCE_SUM_SSE_AND_STORE
;
; Common tail of the three kernels: collapses the per-lane accumulators to
; scalars and stores them through the pointers in arg(5) / arg(6).
;
;   In:       xmm6 = eight signed 16-bit partial sums of differences
;             xmm7 = four 32-bit partial sums of squared differences
;             xmm0 = don't care (its words are shifted out by psrad below)
;   Out:      dword [arg(5)] = sum of all xmm6 lanes (sign-extended)
;             dword [arg(6)] = sum of all xmm7 lanes
;   Clobbers: xmm0, xmm1, xmm5, xmm6, xmm7, rsi, rdi
;-----------------------------------------------------------------------------
%macro REDUCE_SUM_SSE_AND_STORE 0
    pxor        xmm1,       xmm1
    pxor        xmm5,       xmm5                    ; xmm5 = 0, used as filler

    ; Sign-extend the 16-bit sums: place each word in the upper half of a
    ; dword, then shift it back down arithmetically.
    punpcklwd   xmm0,       xmm6
    punpckhwd   xmm1,       xmm6
    psrad       xmm0,       16
    psrad       xmm1,       16
    paddd       xmm0,       xmm1                    ; four 32-bit partial sums
    movdqa      xmm1,       xmm0

    ; Fold the squared-difference dwords 4 -> 2 (interleaved with zeros).
    movdqa      xmm6,       xmm7
    punpckldq   xmm6,       xmm5
    punpckhdq   xmm7,       xmm5
    paddd       xmm6,       xmm7

    ; Same fold for the difference sums.
    punpckldq   xmm0,       xmm5
    punpckhdq   xmm1,       xmm5
    paddd       xmm0,       xmm1

    ; Fold 2 -> 1: add the upper 8 bytes onto the lower 8 bytes.
    movdqa      xmm7,       xmm6
    movdqa      xmm1,       xmm0

    psrldq      xmm7,       8
    psrldq      xmm1,       8

    paddd       xmm6,       xmm7                    ; low dword = total SSE
    paddd       xmm0,       xmm1                    ; low dword = total sum

    mov         rsi,        arg(5)                  ; sum
    mov         rdi,        arg(6)                  ; sumsquared

    movd        [rsi],      xmm0
    movd        [rdi],      xmm6
%endmacro

;-----------------------------------------------------------------------------
;void vp9_half_horiz_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Prediction for row i = avg( avg(ref[i][x],   ref[i][x+1]),
;                             avg(ref[i+1][x], ref[i+1][x+1]) ),  x = 0..15
;
; Reads 17 bytes from each of Height + 1 consecutive ref rows and 16 bytes
; from each of Height src rows.
;
; Register roles inside the loop:
;   rsi = current ref row (i + 1)     rax = ref stride
;   rdi = current src row (i)         rdx = src stride
;   rcx = rows remaining
;   xmm0 = zero                       xmm5 = horizontal average of ref row i
;   xmm6 = difference accumulator     xmm7 = squared-difference accumulator
;-----------------------------------------------------------------------------
global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6,       xmm6                    ; difference accumulator
    pxor        xmm7,       xmm7                    ; squared-difference accumulator
    mov         rsi,        arg(0)                  ; ref_ptr

    mov         rdi,        arg(2)                  ; src_ptr
    movsxd      rcx,        dword ptr arg(4)        ; Height
    movsxd      rax,        dword ptr arg(1)        ; ref_pixels_per_line
    movsxd      rdx,        dword ptr arg(3)        ; src_pixels_per_line

    pxor        xmm0,       xmm0                    ; zero for byte -> word unpacking

    ; Prime xmm5 with the horizontal average of the first ref row.
    movdqu      xmm5,       XMMWORD PTR [rsi]
    movdqu      xmm3,       XMMWORD PTR [rsi+1]
    pavgb       xmm5,       xmm3                    ; xmm5 = avg(ref[0][x], ref[0][x+1])

    lea         rsi,        [rsi + rax]             ; advance to ref row 1

.hv_next_row:
    movdqu      xmm1,       XMMWORD PTR [rsi]
    movdqu      xmm2,       XMMWORD PTR [rsi+1]
    pavgb       xmm1,       xmm2                    ; xmm1 = horizontal average of ref row i+1

    pavgb       xmm5,       xmm1                    ; xmm5 = vertical average of rows i and i+1

    movdqa      xmm4,       xmm5
    punpcklbw   xmm5,       xmm0                    ; xmm5 = prediction 0..7 as words
    punpckhbw   xmm4,       xmm0                    ; xmm4 = prediction 8..15 as words

    movq        xmm3,       QWORD PTR [rdi]         ; src bytes 0..7
    punpcklbw   xmm3,       xmm0                    ; widen to words
    psubw       xmm5,       xmm3                    ; xmm5 = prediction - src (0..7)

    movq        xmm3,       QWORD PTR [rdi+8]       ; src bytes 8..15
    punpcklbw   xmm3,       xmm0
    psubw       xmm4,       xmm3                    ; xmm4 = prediction - src (8..15)

    paddw       xmm6,       xmm5                    ; accumulate differences
    paddw       xmm6,       xmm4
    pmaddwd     xmm5,       xmm5                    ; square and pair-sum into dwords
    pmaddwd     xmm4,       xmm4
    paddd       xmm7,       xmm5                    ; accumulate squared differences
    paddd       xmm7,       xmm4

    movdqa      xmm5,       xmm1                    ; row i+1 average becomes row i next time

    lea         rsi,        [rsi + rax]
    lea         rdi,        [rdi + rdx]

    sub         rcx,        1
    jnz         .hv_next_row

    REDUCE_SUM_SSE_AND_STORE

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp9_half_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Prediction for row i = avg(ref[i][x], ref[i+1][x]),  x = 0..15
;
; Reads 16 bytes from each of Height + 1 consecutive ref rows and 16 bytes
; from each of Height src rows.
;
; Register roles inside the loop:
;   rsi = current ref row (i + 1)     rax = ref stride
;   rdi = current src row (i)         rdx = src stride
;   rcx = rows remaining
;   xmm0 = zero                       xmm5 = ref row i
;   xmm6 = difference accumulator     xmm7 = squared-difference accumulator
;-----------------------------------------------------------------------------
global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
sym(vp9_half_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6,       xmm6                    ; difference accumulator
    pxor        xmm7,       xmm7                    ; squared-difference accumulator
    mov         rsi,        arg(0)                  ; ref_ptr

    mov         rdi,        arg(2)                  ; src_ptr
    movsxd      rcx,        dword ptr arg(4)        ; Height
    movsxd      rax,        dword ptr arg(1)        ; ref_pixels_per_line
    movsxd      rdx,        dword ptr arg(3)        ; src_pixels_per_line

    movdqu      xmm5,       XMMWORD PTR [rsi]       ; xmm5 = ref row 0
    lea         rsi,        [rsi + rax ]            ; advance to ref row 1
    pxor        xmm0,       xmm0                    ; zero for byte -> word unpacking

.v_next_row:
    movdqu      xmm3,       XMMWORD PTR [rsi]       ; xmm3 = ref row i+1

    pavgb       xmm5,       xmm3                    ; xmm5 = avg(row i, row i+1)
    movdqa      xmm4,       xmm5
    punpcklbw   xmm5,       xmm0                    ; xmm5 = prediction 0..7 as words
    punpckhbw   xmm4,       xmm0                    ; xmm4 = prediction 8..15 as words

    movq        xmm2,       QWORD PTR [rdi]         ; src bytes 0..7
    punpcklbw   xmm2,       xmm0
    psubw       xmm5,       xmm2                    ; xmm5 = prediction - src (0..7)
    movq        xmm2,       QWORD PTR [rdi+8]       ; src bytes 8..15
    punpcklbw   xmm2,       xmm0
    psubw       xmm4,       xmm2                    ; xmm4 = prediction - src (8..15)

    paddw       xmm6,       xmm5                    ; accumulate differences
    paddw       xmm6,       xmm4
    pmaddwd     xmm5,       xmm5                    ; square and pair-sum into dwords
    pmaddwd     xmm4,       xmm4
    paddd       xmm7,       xmm5                    ; accumulate squared differences
    paddd       xmm7,       xmm4

    movdqa      xmm5,       xmm3                    ; row i+1 becomes row i next time

    lea         rsi,        [rsi + rax]
    lea         rdi,        [rdi + rdx]

    sub         rcx,        1
    jnz         .v_next_row

    REDUCE_SUM_SSE_AND_STORE

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp9_half_horiz_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Prediction for row i = avg(ref[i][x], ref[i][x+1]),  x = 0..15
;
; Reads 17 bytes from each of Height ref rows and 16 bytes from each of
; Height src rows.
;
; Register roles inside the loop:
;   rsi = current ref row             rax = ref stride
;   rdi = current src row             rdx = src stride
;   rcx = rows remaining              xmm0 = zero
;   xmm6 = difference accumulator     xmm7 = squared-difference accumulator
;-----------------------------------------------------------------------------
global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance16x_h_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6,       xmm6                    ; difference accumulator
    pxor        xmm7,       xmm7                    ; squared-difference accumulator
    mov         rsi,        arg(0)                  ; ref_ptr

    mov         rdi,        arg(2)                  ; src_ptr
    movsxd      rcx,        dword ptr arg(4)        ; Height
    movsxd      rax,        dword ptr arg(1)        ; ref_pixels_per_line
    movsxd      rdx,        dword ptr arg(3)        ; src_pixels_per_line

    pxor        xmm0,       xmm0                    ; zero for byte -> word unpacking

.h_next_row:
    movdqu      xmm5,       XMMWORD PTR [rsi]       ; xmm5 = s0,s1,s2..s15
    movdqu      xmm3,       XMMWORD PTR [rsi+1]     ; xmm3 = s1,s2,s3..s16

    pavgb       xmm5,       xmm3                    ; xmm5 = avg(xmm5, xmm3)
    movdqa      xmm1,       xmm5
    punpcklbw   xmm5,       xmm0                    ; xmm5 = prediction 0..7 as words
    punpckhbw   xmm1,       xmm0                    ; xmm1 = prediction 8..15 as words

    movq        xmm3,       QWORD PTR [rdi]         ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3,       xmm0                    ; widen to words
    movq        xmm2,       QWORD PTR [rdi+8]       ; xmm2 = d8,d9..d15
    punpcklbw   xmm2,       xmm0

    psubw       xmm5,       xmm3                    ; xmm5 = prediction - src (0..7)
    psubw       xmm1,       xmm2                    ; xmm1 = prediction - src (8..15)
    paddw       xmm6,       xmm5                    ; accumulate differences
    paddw       xmm6,       xmm1
    pmaddwd     xmm5,       xmm5                    ; square and pair-sum into dwords
    pmaddwd     xmm1,       xmm1
    paddd       xmm7,       xmm5                    ; accumulate squared differences
    paddd       xmm7,       xmm1

    lea         rsi,        [rsi + rax]
    lea         rdi,        [rdi + rdx]

    sub         rcx,        1
    jnz         .h_next_row

    REDUCE_SUM_SSE_AND_STORE

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret