;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; SSE4.1 "x8" SAD kernels (Intel syntax, NASM/YASM).
;
; Every function below sums absolute differences between one source block
; and the eight reference blocks that start at ref_ptr+0 .. ref_ptr+7, and
; stores the eight sums as 32-bit values in sad_array[0..7].
;
; Register roles shared by all macros and functions in this file:
;   rsi  = current source row          rax = src_stride (sign-extended)
;   rdi  = current reference row       rdx = ref_stride (sign-extended)
;   xmm1 = eight running 16-bit sums (lane i <-> reference offset i)
;   xmm0, xmm2, xmm3, xmm4, xmm5 = scratch
;
; The sums are accumulated with paddw (16-bit lanes).  The largest block
; handled here, 16x16, can reach at most 16*16*255 = 65280, which still
; fits in an unsigned 16-bit lane.
;
; Requirements implied by the movdqa instructions used:
;   * 16-wide kernels: every source row (src_ptr + n*src_stride) must be
;     16-byte aligned.
;   * all kernels: sad_array must be 16-byte aligned and hold 8 dwords.
;
; MPSADBW immediates used:
;   0x0 -> source bytes 0..3 against destination windows starting at 0..7
;   0x5 -> source bytes 4..7 against destination windows starting at 4..11
;-----------------------------------------------------------------------------

;-----------------------------------------------------------------------------
; LOAD_ROW_16X8 acc, src_addr, ref_addr
;   xmm0 <- 16 source bytes at [src_addr] (aligned load)
;   acc  <- reference bytes 0..15 at [ref_addr]
;   xmm3 <- reference bytes 8..23 at [ref_addr]
;   Clobbers xmm2.
;-----------------------------------------------------------------------------
%macro LOAD_ROW_16X8 3
    movdqa          xmm0,       XMMWORD PTR [%2]
    movq            %1,         MMWORD PTR [%3]
    movq            xmm3,       MMWORD PTR [%3+8]
    movq            xmm2,       MMWORD PTR [%3+16]
    punpcklqdq      %1,         xmm3
    punpcklqdq      xmm3,       xmm2
%endmacro

;-----------------------------------------------------------------------------
; SAD_ROW_16X8 acc
;   In:  xmm0 = source row, acc = ref bytes 0..15, xmm3 = ref bytes 8..23
;        (as left by LOAD_ROW_16X8)
;   Out: acc = eight 16-bit SADs of this 16-pixel row
;   Clobbers xmm0, xmm2, xmm3, xmm4.
;-----------------------------------------------------------------------------
%macro SAD_ROW_16X8 1
    movdqa          xmm2,       %1
    mpsadbw         %1,         xmm0,       0x0     ; src 0..3
    mpsadbw         xmm2,       xmm0,       0x5     ; src 4..7

    psrldq          xmm0,       8                   ; src 8..15 -> low half

    movdqa          xmm4,       xmm3
    mpsadbw         xmm3,       xmm0,       0x0     ; src 8..11
    mpsadbw         xmm4,       xmm0,       0x5     ; src 12..15

    paddw           %1,         xmm2
    paddw           %1,         xmm3
    paddw           %1,         xmm4
%endmacro

;-----------------------------------------------------------------------------
; PROCESS_16X2X8 first
;   Accumulates two 16-pixel rows into xmm1 and advances rsi/rdi by two rows.
;   first != 0: xmm1 is initialised from the first row (no prior contents
;               are read).
;   first == 0: both rows are added to the existing xmm1.
;   Clobbers xmm0, xmm2, xmm3, xmm4, xmm5.
;-----------------------------------------------------------------------------
%macro PROCESS_16X2X8 1
%if %1
    LOAD_ROW_16X8   xmm1,       rsi,        rdi
    SAD_ROW_16X8    xmm1
%else
    LOAD_ROW_16X8   xmm5,       rsi,        rdi
    SAD_ROW_16X8    xmm5

    paddw           xmm1,       xmm5
%endif
    LOAD_ROW_16X8   xmm5,       rsi + rax,  rdi + rdx

    ; Second row is already in registers; step both pointers two rows.
    lea             rsi,        [rsi+rax*2]
    lea             rdi,        [rdi+rdx*2]

    SAD_ROW_16X8    xmm5

    paddw           xmm1,       xmm5
%endmacro

;-----------------------------------------------------------------------------
; LOAD_ROW_8X8 acc, src_addr, ref_addr
;   xmm0 <- 8 source bytes at [src_addr] (upper half zeroed by movq)
;   acc  <- reference bytes 0..15 at [ref_addr]
;   Clobbers xmm3.
;-----------------------------------------------------------------------------
%macro LOAD_ROW_8X8 3
    movq            xmm0,       MMWORD PTR [%2]
    movq            %1,         MMWORD PTR [%3]
    movq            xmm3,       MMWORD PTR [%3+8]
    punpcklqdq      %1,         xmm3
%endmacro

;-----------------------------------------------------------------------------
; SAD_ROW_8X8 acc
;   In:  xmm0 = source row, acc = ref bytes 0..15 (as left by LOAD_ROW_8X8)
;   Out: acc = eight 16-bit SADs of this 8-pixel row
;   Clobbers xmm2.
;-----------------------------------------------------------------------------
%macro SAD_ROW_8X8 1
    movdqa          xmm2,       %1
    mpsadbw         %1,         xmm0,       0x0     ; src 0..3
    mpsadbw         xmm2,       xmm0,       0x5     ; src 4..7
    paddw           %1,         xmm2
%endmacro

;-----------------------------------------------------------------------------
; PROCESS_8X2X8 first
;   Accumulates two 8-pixel rows into xmm1 and advances rsi/rdi by two rows.
;   first != 0 initialises xmm1; first == 0 adds to it.
;   Clobbers xmm0, xmm2, xmm3, xmm5.
;-----------------------------------------------------------------------------
%macro PROCESS_8X2X8 1
%if %1
    LOAD_ROW_8X8    xmm1,       rsi,        rdi
    SAD_ROW_8X8     xmm1
%else
    LOAD_ROW_8X8    xmm5,       rsi,        rdi
    SAD_ROW_8X8     xmm5

    paddw           xmm1,       xmm5
%endif
    LOAD_ROW_8X8    xmm5,       rsi + rax,  rdi + rdx

    lea             rsi,        [rsi+rax*2]
    lea             rdi,        [rdi+rdx*2]

    SAD_ROW_8X8     xmm5

    paddw           xmm1,       xmm5
%endmacro

;-----------------------------------------------------------------------------
; LOAD_ROW_4X8 acc, src_addr, ref_addr
;   xmm0 <- 4 source bytes at [src_addr] (remaining bytes zeroed by movd)
;   acc  <- reference bytes 0..15 at [ref_addr]
;   Clobbers xmm3.
;-----------------------------------------------------------------------------
%macro LOAD_ROW_4X8 3
    movd            xmm0,       [%2]
    movq            %1,         MMWORD PTR [%3]
    movq            xmm3,       MMWORD PTR [%3+8]
    punpcklqdq      %1,         xmm3
%endmacro

;-----------------------------------------------------------------------------
; PROCESS_4X2X8 first
;   Accumulates two 4-pixel rows into xmm1 and advances rsi/rdi by two rows.
;   first != 0 initialises xmm1; first == 0 adds to it.
;   A single MPSADBW (imm 0x0) covers the whole 4-pixel row.
;   Clobbers xmm0, xmm3, xmm5.
;-----------------------------------------------------------------------------
%macro PROCESS_4X2X8 1
%if %1
    LOAD_ROW_4X8    xmm1,       rsi,        rdi

    mpsadbw         xmm1,       xmm0,       0x0
%else
    LOAD_ROW_4X8    xmm5,       rsi,        rdi

    mpsadbw         xmm5,       xmm0,       0x0

    paddw           xmm1,       xmm5
%endif
    LOAD_ROW_4X8    xmm5,       rsi + rax,  rdi + rdx

    lea             rsi,        [rsi+rax*2]
    lea             rdi,        [rdi+rdx*2]

    mpsadbw         xmm5,       xmm0,       0x0

    paddw           xmm1,       xmm5
%endmacro

;-----------------------------------------------------------------------------
; WRITE_AS_INTS
;   Zero-extends the eight 16-bit sums in xmm1 to 32 bits and stores them to
;   the buffer passed as argument 4 (32 bytes, aligned stores).
;   Clobbers rdi, xmm0, xmm1, xmm2.
;-----------------------------------------------------------------------------
%macro WRITE_AS_INTS 0
    mov             rdi,        arg(4)              ;Results
    pxor            xmm0,       xmm0
    movdqa          xmm2,       xmm1
    punpcklwd       xmm1,       xmm0                ; sums 0..3 -> dwords
    punpckhwd       xmm2,       xmm0                ; sums 4..7 -> dwords

    movdqa          [rdi],      xmm1
    movdqa          [rdi + 16], xmm2
%endmacro

;-----------------------------------------------------------------------------
; SAD_X8_PROLOG
;   Common entry sequence: builds the frame, saves rsi/rdi and loads the
;   pointer/stride arguments into rsi, rdi, rax, rdx (see register roles at
;   the top of the file).
;-----------------------------------------------------------------------------
%macro SAD_X8_PROLOG 0
    push            rbp
    mov             rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push            rsi
    push            rdi
    ; end prolog

    mov             rsi,        arg(0)              ;src_ptr
    mov             rdi,        arg(2)              ;ref_ptr

    movsxd          rax,        dword ptr arg(1)    ;src_stride
    movsxd          rdx,        dword ptr arg(3)    ;ref_stride
%endmacro

;-----------------------------------------------------------------------------
; SAD_X8_EPILOG
;   Common exit sequence: stores the eight sums held in xmm1, restores
;   rdi/rsi and the frame, and returns.
;-----------------------------------------------------------------------------
%macro SAD_X8_EPILOG 0
    WRITE_AS_INTS

    ; begin epilog
    pop             rdi
    pop             rsi
    UNSHADOW_ARGS
    pop             rbp
    ret
%endmacro

;void vp9_sad16x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array);      ; eight 32-bit results, 16-byte aligned
global sym(vp9_sad16x16x8_sse4) PRIVATE
sym(vp9_sad16x16x8_sse4):
    SAD_X8_PROLOG

    ; 16 rows = 8 row pairs
    PROCESS_16X2X8 1
%rep 7
    PROCESS_16X2X8 0
%endrep

    SAD_X8_EPILOG


;void vp9_sad16x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array        ; eight 32-bit results, 16-byte aligned
;);
global sym(vp9_sad16x8x8_sse4) PRIVATE
sym(vp9_sad16x8x8_sse4):
    SAD_X8_PROLOG

    ; 8 rows = 4 row pairs
    PROCESS_16X2X8 1
%rep 3
    PROCESS_16X2X8 0
%endrep

    SAD_X8_EPILOG


;void vp9_sad8x8x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array        ; eight 32-bit results, 16-byte aligned
;);
global sym(vp9_sad8x8x8_sse4) PRIVATE
sym(vp9_sad8x8x8_sse4):
    SAD_X8_PROLOG

    ; 8 rows = 4 row pairs
    PROCESS_8X2X8 1
%rep 3
    PROCESS_8X2X8 0
%endrep

    SAD_X8_EPILOG


;void vp9_sad8x16x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array        ; eight 32-bit results, 16-byte aligned
;);
global sym(vp9_sad8x16x8_sse4) PRIVATE
sym(vp9_sad8x16x8_sse4):
    SAD_X8_PROLOG

    ; 16 rows = 8 row pairs
    PROCESS_8X2X8 1
%rep 7
    PROCESS_8X2X8 0
%endrep

    SAD_X8_EPILOG


;void vp9_sad4x4x8_sse4(
;    const unsigned char *src_ptr,
;    int  src_stride,
;    const unsigned char *ref_ptr,
;    int  ref_stride,
;    unsigned int *sad_array        ; eight 32-bit results, 16-byte aligned
;);
global sym(vp9_sad4x4x8_sse4) PRIVATE
sym(vp9_sad4x4x8_sse4):
    SAD_X8_PROLOG

    ; 4 rows = 2 row pairs
    PROCESS_4X2X8 1
    PROCESS_4X2X8 0

    SAD_X8_EPILOG