;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
;
; Sum of squared differences between one 4x4 block of quantized
; coefficients and its dequantized counterpart (16 shorts = 32 bytes
; per buffer).  Both buffers are assumed 16-byte aligned (movdqa).
; Return: rax = SSE (assumed to fit in 32 bits; upper lanes are zero,
;         so movq yields a zero-extended value).
; Clobbers: xmm0-xmm3, xmm5 (all caller-saved in both ABIs).
;-----------------------------------------------------------------------
;int vp8_block_error_xmm(short *coeff_ptr,  short *dcoef_ptr)
global sym(vp8_block_error_xmm) PRIVATE
sym(vp8_block_error_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push        rsi
    push        rdi
    ; end prologue

        mov         rsi,        arg(0)          ;coeff_ptr
        mov         rdi,        arg(1)          ;dcoef_ptr

        ; load all 16 coefficients of each buffer (2 x 8 shorts)
        movdqa      xmm0,       [rsi]
        movdqa      xmm1,       [rdi]

        movdqa      xmm2,       [rsi+16]
        movdqa      xmm3,       [rdi+16]

        ; per-lane differences
        psubw       xmm0,       xmm1
        psubw       xmm2,       xmm3

        ; square and pairwise-add: each dword lane = d0*d0 + d1*d1
        pmaddwd     xmm0,       xmm0
        pmaddwd     xmm2,       xmm2

        paddd       xmm0,       xmm2            ; four dword partial sums

        ; horizontal reduction of the four dword lanes into the low dword
        pxor        xmm5,       xmm5
        movdqa      xmm1,       xmm0

        punpckldq   xmm0,       xmm5            ; interleave with zero so the
        punpckhdq   xmm1,       xmm5            ; unused upper lanes stay 0

        paddd       xmm0,       xmm1
        movdqa      xmm1,       xmm0

        psrldq      xmm0,       8               ; fold high qword onto low
        paddd       xmm0,       xmm1

        movq        rax,        xmm0            ; return SSE

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
;
; MMX version of the above.  Same contract: SSE over 16 shorts.
; NOTE(review): the mm1 mask sequence is vestigial — it derives from a
; removed "dc" argument (see the "; from movd mm1, dc" comment below).
; With dc fixed at 0, pcmpeqw against zero yields an all-ones mask, so
; the pand is a no-op and all 16 coefficients are included.
; Clobbers: mm0-mm7.  No emms here — the caller is responsible for
; clearing MMX state (libvpx convention); confirm against callers.
;-----------------------------------------------------------------------
;int vp8_block_error_mmx(short *coeff_ptr,  short *dcoef_ptr)
global sym(vp8_block_error_mmx) PRIVATE
sym(vp8_block_error_mmx):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push rsi
    push rdi
    ; end prolog


        mov         rsi,        arg(0)          ;coeff_ptr
        pxor        mm7,        mm7             ; constant zero

        mov         rdi,        arg(1)          ;dcoef_ptr
        movq        mm3,        [rsi]

        movq        mm4,        [rdi]
        movq        mm5,        [rsi+8]

        movq        mm6,        [rdi+8]
        pxor        mm1,        mm1             ; from movd mm1, dc ; dc =0

        movq        mm2,        mm7
        psubw       mm5,        mm6

        por         mm1,        mm2             ; mm1 still 0
        pmaddwd     mm5,        mm5

        pcmpeqw     mm1,        mm7             ; 0 == 0 -> mask = all ones
        psubw       mm3,        mm4

        pand        mm1,        mm3             ; no-op mask (see header note)
        pmaddwd     mm1,        mm1

        paddd       mm1,        mm5             ; accumulate first 8 coeffs
        movq        mm3,        [rsi+16]

        movq        mm4,        [rdi+16]
        movq        mm5,        [rsi+24]

        movq        mm6,        [rdi+24]
        psubw       mm5,        mm6

        pmaddwd     mm5,        mm5
        psubw       mm3,        mm4

        pmaddwd     mm3,        mm3
        paddd       mm3,        mm5

        paddd       mm1,        mm3             ; total in two dword lanes

        ; horizontal add of the two dword halves
        movq        mm0,        mm1

        psrlq       mm1,        32
        paddd       mm0,        mm1

        movq        rax,        mm0             ; return SSE

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret


;-----------------------------------------------------------------------
; int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; SSE over a whole macroblock's 16 luma 4x4 blocks (16 iterations of
; 32 bytes = 256 shorts per buffer).
; dc mask: movd loads the dc flag into the low dword; pcmpeqw against
; zero builds a per-word mask.  When dc == 0 every word compares equal
; to zero, so the mask is all ones and every coefficient counts.  When
; dc != 0 the word lane holding its nonzero bits gets a zero mask, so
; the first (DC) difference of each 8-short group loaded into mm3 is
; dropped from the sum.
; Clobbers: mm0-mm7, rcx.  No emms (see note on vp8_block_error_mmx).
;-----------------------------------------------------------------------
global sym(vp8_mbblock_error_mmx_impl) PRIVATE
sym(vp8_mbblock_error_mmx_impl):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push rsi
    push rdi
    ; end prolog


        mov         rsi,        arg(0)          ;coeff_ptr
        pxor        mm7,        mm7             ; constant zero

        mov         rdi,        arg(1)          ;dcoef_ptr
        pxor        mm2,        mm2             ; running dword accumulator

        movd        mm1,        dword ptr arg(2) ;dc
        por         mm1,        mm2

        pcmpeqw     mm1,        mm7             ; per-word dc mask (see header)
        mov         rcx,        16              ; 16 blocks of 16 coeffs

.mberror_loop_mmx:
        movq        mm3,        [rsi]
        movq        mm4,        [rdi]

        movq        mm5,        [rsi+8]
        movq        mm6,        [rdi+8]


        psubw       mm5,        mm6
        pmaddwd     mm5,        mm5

        psubw       mm3,        mm4
        pand        mm3,        mm1             ; optionally drop DC diff

        pmaddwd     mm3,        mm3
        paddd       mm2,        mm5

        paddd       mm2,        mm3
        movq        mm3,        [rsi+16]

        movq        mm4,        [rdi+16]
        movq        mm5,        [rsi+24]

        movq        mm6,        [rdi+24]
        psubw       mm5,        mm6

        pmaddwd     mm5,        mm5
        psubw       mm3,        mm4

        pmaddwd     mm3,        mm3
        paddd       mm2,        mm5

        paddd       mm2,        mm3
        add         rsi,        32              ; next 16 coefficients

        add         rdi,        32
        sub         rcx,        1

        jnz         .mberror_loop_mmx

        ; horizontal add of the two dword halves
        movq        mm0,        mm2
        psrlq       mm2,        32

        paddd       mm0,        mm2
        movq        rax,        mm0             ; return SSE

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret


;-----------------------------------------------------------------------
; int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; SSE2 version of vp8_mbblock_error_mmx_impl: same contract, 16
; iterations of 32 bytes.  Buffers assumed 16-byte aligned (movdqa).
; The dc mask works as in the MMX version (pcmpeqw against zero).
; SAVE_XMM 6 preserves xmm6 for the Win64 ABI (xmm6+ are callee-saved
; there); clobbers xmm0-xmm5, rcx.
;-----------------------------------------------------------------------
global sym(vp8_mbblock_error_xmm_impl) PRIVATE
sym(vp8_mbblock_error_xmm_impl):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 6
    push rsi
    push rdi
    ; end prolog


        mov         rsi,        arg(0)          ;coeff_ptr
        pxor        xmm6,       xmm6            ; constant zero

        mov         rdi,        arg(1)          ;dcoef_ptr
        pxor        xmm4,       xmm4            ; running dword accumulator

        movd        xmm5,       dword ptr arg(2) ;dc
        por         xmm5,       xmm4

        pcmpeqw     xmm5,       xmm6            ; per-word dc mask (see header)
        mov         rcx,        16              ; 16 blocks of 16 coeffs

.mberror_loop:
        movdqa      xmm0,       [rsi]
        movdqa      xmm1,       [rdi]

        movdqa      xmm2,       [rsi+16]
        movdqa      xmm3,       [rdi+16]


        psubw       xmm2,       xmm3
        pmaddwd     xmm2,       xmm2

        psubw       xmm0,       xmm1
        pand        xmm0,       xmm5            ; optionally drop DC diff

        pmaddwd     xmm0,       xmm0
        add         rsi,        32              ; next 16 coefficients

        add         rdi,        32

        sub         rcx,        1
        paddd       xmm4,       xmm2

        paddd       xmm4,       xmm0
        jnz         .mberror_loop

        ; horizontal reduction of four dword lanes into the low dword
        movdqa      xmm0,       xmm4
        punpckldq   xmm0,       xmm6            ; interleave with zero

        punpckhdq   xmm4,       xmm6
        paddd       xmm0,       xmm4

        movdqa      xmm1,       xmm0
        psrldq      xmm0,       8               ; fold high qword onto low

        paddd       xmm0,       xmm1
        movq        rax,        xmm0            ; return SSE

    pop rdi
    pop rsi
    ; begin epilog
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret


;-----------------------------------------------------------------------
; int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
;
; SSE over the chroma (U+V) coefficients of a macroblock: 16
; iterations of 16 bytes = 128 shorts per buffer.  No dc masking.
; Clobbers: mm0-mm4, mm7, rcx.  No emms (see note above).
;-----------------------------------------------------------------------
global sym(vp8_mbuverror_mmx_impl) PRIVATE
sym(vp8_mbuverror_mmx_impl):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push rsi
    push rdi
    ; end prolog


        mov             rsi,        arg(0)      ;s_ptr
        mov             rdi,        arg(1)      ;d_ptr

        mov             rcx,        16          ; 16 x 8 shorts
        pxor            mm7,        mm7         ; running dword accumulator

.mbuverror_loop_mmx:

        movq            mm1,        [rsi]
        movq            mm2,        [rdi]

        psubw           mm1,        mm2
        pmaddwd         mm1,        mm1         ; squared diffs, pair-summed


        movq            mm3,        [rsi+8]
        movq            mm4,        [rdi+8]

        psubw           mm3,        mm4
        pmaddwd         mm3,        mm3


        paddd           mm7,        mm1
        paddd           mm7,        mm3


        add             rsi,        16
        add             rdi,        16

        dec             rcx
        jnz             .mbuverror_loop_mmx

        ; horizontal add of the two dword halves
        movq            mm0,        mm7
        psrlq           mm7,        32

        paddd           mm0,        mm7
        movq            rax,        mm0         ; return SSE

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret


;-----------------------------------------------------------------------
; int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
;
; SSE2 version of vp8_mbuverror_mmx_impl: 16 iterations of 16 bytes
; (one xmm load per buffer per iteration).  Buffers assumed 16-byte
; aligned (movdqa).  Clobbers: xmm0-xmm3, rcx.
;-----------------------------------------------------------------------
global sym(vp8_mbuverror_xmm_impl) PRIVATE
sym(vp8_mbuverror_xmm_impl):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push rsi
    push rdi
    ; end prolog


        mov             rsi,        arg(0)      ;s_ptr
        mov             rdi,        arg(1)      ;d_ptr

        mov             rcx,        16          ; 16 x 8 shorts
        pxor            xmm3,       xmm3        ; running dword accumulator

.mbuverror_loop:

        movdqa          xmm1,       [rsi]
        movdqa          xmm2,       [rdi]

        psubw           xmm1,       xmm2
        pmaddwd         xmm1,       xmm1        ; squared diffs, pair-summed

        paddd           xmm3,       xmm1

        add             rsi,        16
        add             rdi,        16

        dec             rcx
        jnz             .mbuverror_loop

        ; horizontal reduction of four dword lanes into the low dword
        pxor            xmm0,       xmm0
        movdqa          xmm1,       xmm3

        movdqa          xmm2,       xmm1
        punpckldq       xmm1,       xmm0        ; interleave with zero

        punpckhdq       xmm2,       xmm0
        paddd           xmm1,       xmm2

        movdqa          xmm2,       xmm1

        psrldq          xmm1,       8           ; fold high qword onto low
        paddd           xmm1,       xmm2

        movq            rax,        xmm1        ; return SSE

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret