;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
;
; Common prologue for the five-argument "x3" SAD functions in this file.
; Binds the symbolic names src_ptr, src_stride, ref_ptr, ref_stride,
; end_ptr, ret_var, result_ptr, max_err and height to the register or
; stack slot that holds them for the ABI being assembled.
;
;   32-bit : saves rbp/rsi/rdi/rbx, loads the pointers from the stack
;            and widens both strides with movsxd.
;   Win64  : saves xmm6-xmm7 through SAVE_XMM; the fifth argument is
;            read from the caller's stack.
;   other  : arguments are used directly from the integer argument
;            registers (rdi, rsi, rdx, rcx, r8).
;
; The strides are C "int" values.  In the 64-bit conventions the upper
; half of the register carrying an int argument is not defined, yet the
; strides are used as full 64-bit index registers below, so they are
; sign-extended here in every path, not only the 32-bit one.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_err       arg(4)
  %define     height        dword ptr arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     max_err     [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]

    movsxd      rdx,        edx                 ; src_stride: int -> 64-bit
    movsxd      r9,         r9d                 ; ref_stride: int -> 64-bit
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     max_err     r8
    %define     height      r8

    movsxd      rsi,        esi                 ; src_stride: int -> 64-bit
    movsxd      rcx,        ecx                 ; ref_stride: int -> 64-bit
  %endif
%endif

%endmacro

;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
;
; Common epilogue matching STACK_FRAME_CREATE_X3.  Redefines the
; symbolic names to empty so they cannot leak into later code, restores
; whatever the prologue saved for the active ABI, and returns.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  %define     end_ptr
  %define     ret_var
  %define     result_ptr
  %define     max_err
  %define     height

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X2X3 stage, src, ref, src_stride, ref_stride
;
; Handles two 16-byte rows.  Each source row is compared with three
; reference rows starting at ref, ref+1 and ref+2, and the psadbw
; results are accumulated in xmm5, xmm6 and xmm7 respectively.
;
;   stage 0 : first pair of rows; xmm5-xmm7 are initialised.
;   stage 1 : middle pair; results are added to xmm5-xmm7.
;   stage 2 : last pair; as stage 1 but src/ref are not advanced.
;
; For stages 0 and 1 both pointers are advanced by two rows.
; Source rows are loaded with movdqa, so src must be 16-byte aligned;
; reference rows are loaded with lddqu and may be unaligned.
; Clobbers xmm0-xmm3.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 5
%if %1==0
        movdqa          xmm0,       XMMWORD PTR [%2]
        lddqu           xmm5,       XMMWORD PTR [%3]
        lddqu           xmm6,       XMMWORD PTR [%3+1]
        lddqu           xmm7,       XMMWORD PTR [%3+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [%2]
        lddqu           xmm1,       XMMWORD PTR [%3]
        lddqu           xmm2,       XMMWORD PTR [%3+1]
        lddqu           xmm3,       XMMWORD PTR [%3+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        movdqa          xmm0,       XMMWORD PTR [%2+%4]
        lddqu           xmm1,       XMMWORD PTR [%3+%5]
        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        lea             %2,         [%2+%4*2]
        lea             %3,         [%3+%5*2]
%endif

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

;-----------------------------------------------------------------------
; PROCESS_8X2X3 stage, src, ref, src_stride, ref_stride
;
; 8-byte-wide counterpart of PROCESS_16X2X3 using the MMX registers.
; Accumulates the three sums in mm5, mm6 and mm7 (reference offsets
; 0, +1 and +2); the stage argument has the same meaning.
; Clobbers mm0-mm3.
;-----------------------------------------------------------------------
%macro PROCESS_8X2X3 5
%if %1==0
        movq            mm0,       QWORD PTR [%2]
        movq            mm5,       QWORD PTR [%3]
        movq            mm6,       QWORD PTR [%3+1]
        movq            mm7,       QWORD PTR [%3+2]

        psadbw          mm5,       mm0
        psadbw          mm6,       mm0
        psadbw          mm7,       mm0
%else
        movq            mm0,       QWORD PTR [%2]
        movq            mm1,       QWORD PTR [%3]
        movq            mm2,       QWORD PTR [%3+1]
        movq            mm3,       QWORD PTR [%3+2]

        psadbw          mm1,       mm0
        psadbw          mm2,       mm0
        psadbw          mm3,       mm0

        paddw           mm5,       mm1
        paddw           mm6,       mm2
        paddw           mm7,       mm3
%endif
        movq            mm0,       QWORD PTR [%2+%4]
        movq            mm1,       QWORD PTR [%3+%5]
        movq            mm2,       QWORD PTR [%3+%5+1]
        movq            mm3,       QWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        lea             %2,        [%2+%4*2]
        lea             %3,        [%3+%5*2]
%endif

        psadbw          mm1,       mm0
        psadbw          mm2,       mm0
        psadbw          mm3,       mm0

        paddw           mm5,       mm1
        paddw           mm6,       mm2
        paddw           mm7,       mm3
%endmacro

;-----------------------------------------------------------------------
;void vp9_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sums of absolute differences of a 16x16 source block against the
; reference blocks at ref_ptr, ref_ptr+1 and ref_ptr+2.  The three
; 32-bit sums are stored to results[0], results[1] and results[2].
; Nothing is returned in a register.
;-----------------------------------------------------------------------
global sym(vp9_sad16x16x3_sse3) PRIVATE
sym(vp9_sad16x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 macro invocations x 2 rows = 16 rows
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; rcx is free to reuse: strides and pointers are dead from here on
        mov             rcx,        result_ptr

        ; psadbw leaves one partial sum per 64-bit half; fold high into low
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rcx],      xmm0        ; results[0]: ref_ptr
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rcx+4],    xmm0        ; results[1]: ref_ptr+1
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rcx+8],    xmm0        ; results[2]: ref_ptr+2

    STACK_FRAME_DESTROY_X3

;-----------------------------------------------------------------------
;void vp9_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; As vp9_sad16x16x3_sse3 but for a block 16 bytes wide and 8 rows high.
;-----------------------------------------------------------------------
global sym(vp9_sad16x8x3_sse3) PRIVATE
sym(vp9_sad16x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 4 macro invocations x 2 rows = 8 rows
        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        ; fold the two 64-bit partial sums of each accumulator
        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rcx],      xmm0        ; results[0]: ref_ptr
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rcx+4],    xmm0        ; results[1]: ref_ptr+1
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rcx+8],    xmm0        ; results[2]: ref_ptr+2

    STACK_FRAME_DESTROY_X3

;-----------------------------------------------------------------------
;void vp9_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sums of absolute differences of a block 8 bytes wide and 16 rows high
; against the reference at ref_ptr, ref_ptr+1 and ref_ptr+2, stored to
; results[0..2].  Uses the MMX registers.
; NOTE(review): no emms is issued before returning; presumably callers
; clear the MMX state themselves - confirm.
;-----------------------------------------------------------------------
global sym(vp9_sad8x16x3_sse3) PRIVATE
sym(vp9_sad8x16x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 8 macro invocations x 2 rows = 16 rows
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        ; pack sums 0 and 1 into one qword so they go out in a single store
        punpckldq       mm5,        mm6

        movq            [rcx],      mm5         ; results[0], results[1]
        movd            [rcx+8],    mm7         ; results[2]

    STACK_FRAME_DESTROY_X3

;-----------------------------------------------------------------------
;void vp9_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; As vp9_sad8x16x3_sse3 but for a block 8 bytes wide and 8 rows high.
; NOTE(review): no emms is issued before returning; presumably callers
; clear the MMX state themselves - confirm.
;-----------------------------------------------------------------------
global sym(vp9_sad8x8x3_sse3) PRIVATE
sym(vp9_sad8x8x3_sse3):

    STACK_FRAME_CREATE_X3

        ; 4 macro invocations x 2 rows = 8 rows
        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov             rcx,        result_ptr

        ; pack sums 0 and 1 into one qword so they go out in a single store
        punpckldq       mm5,        mm6

        movq            [rcx],      mm5         ; results[0], results[1]
        movd            [rcx+8],    mm7         ; results[2]

    STACK_FRAME_DESTROY_X3

;-----------------------------------------------------------------------
;void vp9_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Sums of absolute differences of a 4x4 source block against the
; reference at ref_ptr, ref_ptr+1 and ref_ptr+2, stored to
; results[0..2].  Two 4-byte rows are interleaved into one 8-byte MMX
; register so that a single psadbw covers two rows at a time.
; NOTE(review): no emms is issued before returning; presumably callers
; clear the MMX state themselves - confirm.
;-----------------------------------------------------------------------
global sym(vp9_sad4x4x3_sse3) PRIVATE
sym(vp9_sad4x4x3_sse3):

    STACK_FRAME_CREATE_X3

        ; ---- rows 0 and 1 ----
        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm1,        DWORD PTR [ref_ptr]

        movd            mm2,        DWORD PTR [src_ptr+src_stride]
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]

        ; source and reference rows are interleaved the same way, so
        ; psadbw still pairs each source byte with its reference byte
        punpcklbw       mm0,        mm2
        punpcklbw       mm1,        mm3

        movd            mm4,        DWORD PTR [ref_ptr+1]
        movd            mm5,        DWORD PTR [ref_ptr+2]

        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]

        psadbw          mm1,        mm0         ; offset 0, rows 0-1

        punpcklbw       mm4,        mm2
        punpcklbw       mm5,        mm3

        psadbw          mm4,        mm0         ; offset 1, rows 0-1
        psadbw          mm5,        mm0         ; offset 2, rows 0-1

        ; ---- rows 2 and 3 ----
        lea             src_ptr,    [src_ptr+src_stride*2]
        lea             ref_ptr,    [ref_ptr+ref_stride*2]

        movd            mm0,        DWORD PTR [src_ptr]
        movd            mm2,        DWORD PTR [ref_ptr]

        movd            mm3,        DWORD PTR [src_ptr+src_stride]
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]

        punpcklbw       mm0,        mm3
        punpcklbw       mm2,        mm6

        movd            mm3,        DWORD PTR [ref_ptr+1]
        movd            mm7,        DWORD PTR [ref_ptr+2]

        psadbw          mm2,        mm0         ; offset 0, rows 2-3

        paddw           mm1,        mm2         ; mm1 = total for offset 0

        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]

        punpcklbw       mm3,        mm2
        punpcklbw       mm7,        mm6

        psadbw          mm3,        mm0         ; offset 1, rows 2-3
        psadbw          mm7,        mm0         ; offset 2, rows 2-3

        paddw           mm3,        mm4         ; mm3 = total for offset 1
        paddw           mm7,        mm5         ; mm7 = total for offset 2

        mov             rcx,        result_ptr

        ; pack sums 0 and 1 into one qword so they go out in a single store
        punpckldq       mm1,        mm3

        movq            [rcx],      mm1         ; results[0], results[1]
        movd            [rcx+8],    mm7         ; results[2]

    STACK_FRAME_DESTROY_X3