;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

; Map the five C arguments (src_ptr, src_stride, ref_ptr, ref_stride,
; results/max_sad/height) onto named registers for the "X3" SAD and copy
; kernels, handling all three ABIs (32-bit, Win64, SysV AMD64).
; NOTE(review): result_ptr/max_sad/height alias the same 5th-argument slot;
; each kernel uses exactly one of them.
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_sad       arg(4)
  %define     height        dword ptr arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    ; 5th argument lives on the stack above the saved xmm area + shadow space.
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     max_sad     [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     max_sad     r8
    %define     height      r8
  %endif
%endif

%endmacro

; Undo STACK_FRAME_CREATE_X3: clear the register aliases, restore saved
; registers, and return.  Every X3 kernel ends by expanding this macro.
%macro STACK_FRAME_DESTROY_X3 0
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  %define     end_ptr
  %define     ret_var
  %define     result_ptr
  %define     max_sad
  %define     height

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro

; Argument setup for the "X4" (4-candidate) SAD kernels.  arg(2) is a base
; pointer to an array of four reference pointers, which LOAD_X4_ADDRESSES
; expands into r0_ptr..r3_ptr.
; NOTE(review): on 32-bit, rbp is pushed a second time and then reused as
; ref_stride; the X4 kernels pop it back before reading result_ptr.
%macro STACK_FRAME_CREATE_X4 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     r0_ptr        rcx
  %define     r1_ptr        rdx
  %define     r2_ptr        rbx
  %define     r3_ptr        rdi
  %define     ref_stride    rbp
  %define     result_ptr    arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    push        rbp                             ; saved again: rbp becomes ref_stride below
    mov         rdi,        arg(2)              ; ref_ptr_base

    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

    mov         rsi,        arg(0)              ; src_ptr

    movsxd      rbx,        dword ptr arg(1)    ; src_stride
    movsxd      rbp,        dword ptr arg(3)    ; ref_stride

    xchg        rbx,        rax                 ; swap so src_stride=rax, r2_ptr=rbx
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     r0_ptr      rsi
    %define     r1_ptr      r10
    %define     r2_ptr      r11
    %define     r3_ptr      r8
    %define     ref_stride  r9
    %define     result_ptr  [rsp+xmm_stack_space+16+4*8]
    push        rsi                             ; rsi is callee-saved on Win64

    LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     r0_ptr      r9
    %define     r1_ptr      r10
    %define     r2_ptr      r11
    %define     r3_ptr      rdx
    %define     ref_stride  rcx
    %define     result_ptr  r8

    LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr

  %endif
%endif
%endmacro

; Undo STACK_FRAME_CREATE_X4 (counterpart of STACK_FRAME_DESTROY_X3).
%macro STACK_FRAME_DESTROY_X4 0
  %define     src_ptr
  %define     src_stride
  %define     r0_ptr
  %define     r1_ptr
  %define     r2_ptr
  %define     r3_ptr
  %define     ref_stride
  %define     result_ptr

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    pop         rsi
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro

; SAD of two 16-byte-wide rows against three references at byte offsets
; 0 / +1 / +2 of %3, accumulated into xmm5/xmm6/xmm7.
;   %1 = mode: 0 first call (initialize accumulators), 1 middle, 2 last
;              (modes 0 and 1 also advance both pointers by 2 rows)
;   %2 = src pointer (16-aligned), %3 = ref pointer (unaligned, lddqu)
;   %4 = src stride, %5 = ref stride
%macro PROCESS_16X2X3 5
%if %1==0
        movdqa          xmm0,       XMMWORD PTR [%2]
        lddqu           xmm5,       XMMWORD PTR [%3]
        lddqu           xmm6,       XMMWORD PTR [%3+1]
        lddqu           xmm7,       XMMWORD PTR [%3+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [%2]
        lddqu           xmm1,       XMMWORD PTR [%3]
        lddqu           xmm2,       XMMWORD PTR [%3+1]
        lddqu           xmm3,       XMMWORD PTR [%3+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        movdqa          xmm0,       XMMWORD PTR [%2+%4]
        lddqu           xmm1,       XMMWORD PTR [%3+%5]
        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        lea             %2,         [%2+%4*2]
        lea             %3,         [%3+%5*2]
%endif

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

; 8-byte-wide variant of PROCESS_16X2X3 using MMX registers; accumulators
; are mm5/mm6/mm7.  Same %1 mode convention.
%macro PROCESS_8X2X3 5
%if %1==0
        movq            mm0,        QWORD PTR [%2]
        movq            mm5,        QWORD PTR [%3]
        movq            mm6,        QWORD PTR [%3+1]
        movq            mm7,        QWORD PTR [%3+2]

        psadbw          mm5,        mm0
        psadbw          mm6,        mm0
        psadbw          mm7,        mm0
%else
        movq            mm0,        QWORD PTR [%2]
        movq            mm1,        QWORD PTR [%3]
        movq            mm2,        QWORD PTR [%3+1]
        movq            mm3,        QWORD PTR [%3+2]

        psadbw          mm1,        mm0
        psadbw          mm2,        mm0
        psadbw          mm3,        mm0

        paddw           mm5,        mm1
        paddw           mm6,        mm2
        paddw           mm7,        mm3
%endif
        movq            mm0,        QWORD PTR [%2+%4]
        movq            mm1,        QWORD PTR [%3+%5]
        movq            mm2,        QWORD PTR [%3+%5+1]
        movq            mm3,        QWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        lea             %2,         [%2+%4*2]
        lea             %3,         [%3+%5*2]
%endif

        psadbw          mm1,        mm0
        psadbw          mm2,        mm0
        psadbw          mm3,        mm0

        paddw           mm5,        mm1
        paddw           mm6,        mm2
        paddw           mm7,        mm3
%endmacro

; Load four reference pointers (%2..%5) from the pointer array at %1.
%macro LOAD_X4_ADDRESSES 5
        mov             %2,         [%1+REG_SZ_BYTES*0]
        mov             %3,         [%1+REG_SZ_BYTES*1]

        mov             %4,         [%1+REG_SZ_BYTES*2]
        mov             %5,         [%1+REG_SZ_BYTES*3]
%endmacro

; SAD of two 16-wide src rows against four independent reference blocks
; (%3..%6), accumulated into xmm4..xmm7.
;   %1 = mode (0 init / 1 middle / 2 last, as in PROCESS_16X2X3)
;   %2 = src, %3..%6 = the four ref pointers, %7 = src stride, %8 = ref stride
%macro PROCESS_16X2X4 8
%if %1==0
        movdqa          xmm0,       XMMWORD PTR [%2]
        lddqu           xmm4,       XMMWORD PTR [%3]
        lddqu           xmm5,       XMMWORD PTR [%4]
        lddqu           xmm6,       XMMWORD PTR [%5]
        lddqu           xmm7,       XMMWORD PTR [%6]

        psadbw          xmm4,       xmm0
        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [%2]
        lddqu           xmm1,       XMMWORD PTR [%3]
        lddqu           xmm2,       XMMWORD PTR [%4]
        lddqu           xmm3,       XMMWORD PTR [%5]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm4,       xmm1
        lddqu           xmm1,       XMMWORD PTR [%6]    ; 4th ref reuses xmm1
        paddw           xmm5,       xmm2
        paddw           xmm6,       xmm3

        psadbw          xmm1,       xmm0
        paddw           xmm7,       xmm1
%endif
        movdqa          xmm0,       XMMWORD PTR [%2+%7]
        lddqu           xmm1,       XMMWORD PTR [%3+%8]
        lddqu           xmm2,       XMMWORD PTR [%4+%8]
        lddqu           xmm3,       XMMWORD PTR [%5+%8]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm4,       xmm1
        lddqu           xmm1,       XMMWORD PTR [%6+%8]
        paddw           xmm5,       xmm2
        paddw           xmm6,       xmm3

%if %1==0 || %1==1
        lea             %2,         [%2+%7*2]
        lea             %3,         [%3+%8*2]

        lea             %4,         [%4+%8*2]
        lea             %5,         [%5+%8*2]

        lea             %6,         [%6+%8*2]
%endif
        psadbw          xmm1,       xmm0
        paddw           xmm7,       xmm1

%endmacro

; 8-byte-wide MMX variant of PROCESS_16X2X4; accumulators mm4..mm7.
%macro PROCESS_8X2X4 8
%if %1==0
        movq            mm0,        QWORD PTR [%2]
        movq            mm4,        QWORD PTR [%3]
        movq            mm5,        QWORD PTR [%4]
        movq            mm6,        QWORD PTR [%5]
        movq            mm7,        QWORD PTR [%6]

        psadbw          mm4,        mm0
        psadbw          mm5,        mm0
        psadbw          mm6,        mm0
        psadbw          mm7,        mm0
%else
        movq            mm0,        QWORD PTR [%2]
        movq            mm1,        QWORD PTR [%3]
        movq            mm2,        QWORD PTR [%4]
        movq            mm3,        QWORD PTR [%5]

        psadbw          mm1,        mm0
        psadbw          mm2,        mm0
        psadbw          mm3,        mm0

        paddw           mm4,        mm1
        movq            mm1,        QWORD PTR [%6]
        paddw           mm5,        mm2
        paddw           mm6,        mm3

        psadbw          mm1,        mm0
        paddw           mm7,        mm1
%endif
        movq            mm0,        QWORD PTR [%2+%7]
        movq            mm1,        QWORD PTR [%3+%8]
        movq            mm2,        QWORD PTR [%4+%8]
        movq            mm3,        QWORD PTR [%5+%8]

        psadbw          mm1,        mm0
        psadbw          mm2,        mm0
        psadbw          mm3,        mm0

        paddw           mm4,        mm1
        movq            mm1,        QWORD PTR [%6+%8]
        paddw           mm5,        mm2
        paddw           mm6,        mm3

%if %1==0 || %1==1
        lea             %2,         [%2+%7*2]
        lea             %3,         [%3+%8*2]

        lea             %4,         [%4+%8*2]
        lea             %5,         [%5+%8*2]

        lea             %6,         [%6+%8*2]
%endif
        psadbw          mm1,        mm0
        paddw           mm7,        mm1

%endmacro

;void int vp8_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
; Computes three 16x16 SADs (ref at +0/+1/+2) and stores them to results[0..2].
global sym(vp8_sad16x16x3_sse3) PRIVATE
sym(vp8_sad16x16x3_sse3):

    STACK_FRAME_CREATE_X3

    PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

    mov             rcx,        result_ptr

    ; each accumulator holds two 64-bit partial sums; fold high half into low
    movq            xmm0,       xmm5
    psrldq          xmm5,       8

    paddw           xmm0,       xmm5
    movd            [rcx],      xmm0
;-
    movq            xmm0,       xmm6
    psrldq          xmm6,       8

    paddw           xmm0,       xmm6
    movd            [rcx+4],    xmm0
;-
    movq            xmm0,       xmm7
    psrldq          xmm7,       8

    paddw           xmm0,       xmm7
    movd            [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3
;void int vp8_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
; Three 16x8 SADs (ref at +0/+1/+2) into results[0..2].
global sym(vp8_sad16x8x3_sse3) PRIVATE
sym(vp8_sad16x8x3_sse3):

    STACK_FRAME_CREATE_X3

    PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

    mov             rcx,        result_ptr

    ; fold the two 64-bit partial sums of each accumulator
    movq            xmm0,       xmm5
    psrldq          xmm5,       8

    paddw           xmm0,       xmm5
    movd            [rcx],      xmm0
;-
    movq            xmm0,       xmm6
    psrldq          xmm6,       8

    paddw           xmm0,       xmm6
    movd            [rcx+4],    xmm0
;-
    movq            xmm0,       xmm7
    psrldq          xmm7,       8

    paddw           xmm0,       xmm7
    movd            [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3

;void int vp8_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
; Three 8x16 SADs (ref at +0/+1/+2) into results[0..2]; MMX accumulators.
global sym(vp8_sad8x16x3_sse3) PRIVATE
sym(vp8_sad8x16x3_sse3):

    STACK_FRAME_CREATE_X3

    PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

    mov             rcx,        result_ptr

    ; pack mm5/mm6 (each a 32-bit SAD) into one qword store
    punpckldq       mm5,        mm6

    movq            [rcx],      mm5
    movd            [rcx+8],    mm7

    STACK_FRAME_DESTROY_X3

;void int vp8_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
; Three 8x8 SADs (ref at +0/+1/+2) into results[0..2].
global sym(vp8_sad8x8x3_sse3) PRIVATE
sym(vp8_sad8x8x3_sse3):

    STACK_FRAME_CREATE_X3

    PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

    mov             rcx,        result_ptr

    punpckldq       mm5,        mm6

    movq            [rcx],      mm5
    movd            [rcx+8],    mm7

    STACK_FRAME_DESTROY_X3

;void int vp8_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
; Three 4x4 SADs.  Rows are paired with punpcklbw so one psadbw covers two
; 4-byte rows at a time.
global sym(vp8_sad4x4x3_sse3) PRIVATE
sym(vp8_sad4x4x3_sse3):

    STACK_FRAME_CREATE_X3

    movd            mm0,        DWORD PTR [src_ptr]
    movd            mm1,        DWORD PTR [ref_ptr]

    movd            mm2,        DWORD PTR [src_ptr+src_stride]
    movd            mm3,        DWORD PTR [ref_ptr+ref_stride]

    punpcklbw       mm0,        mm2                 ; src rows 0+1 in one qword
    punpcklbw       mm1,        mm3                 ; ref rows 0+1 (offset +0)

    movd            mm4,        DWORD PTR [ref_ptr+1]
    movd            mm5,        DWORD PTR [ref_ptr+2]

    movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
    movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]

    psadbw          mm1,        mm0

    punpcklbw       mm4,        mm2                 ; ref rows 0+1 (offset +1)
    punpcklbw       mm5,        mm3                 ; ref rows 0+1 (offset +2)

    psadbw          mm4,        mm0
    psadbw          mm5,        mm0

    lea             src_ptr,    [src_ptr+src_stride*2]
    lea             ref_ptr,    [ref_ptr+ref_stride*2]

    movd            mm0,        DWORD PTR [src_ptr]
    movd            mm2,        DWORD PTR [ref_ptr]

    movd            mm3,        DWORD PTR [src_ptr+src_stride]
    movd            mm6,        DWORD PTR [ref_ptr+ref_stride]

    punpcklbw       mm0,        mm3                 ; src rows 2+3
    punpcklbw       mm2,        mm6

    movd            mm3,        DWORD PTR [ref_ptr+1]
    movd            mm7,        DWORD PTR [ref_ptr+2]

    psadbw          mm2,        mm0

    paddw           mm1,        mm2                 ; total SAD at offset +0

    movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
    movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]

    punpcklbw       mm3,        mm2
    punpcklbw       mm7,        mm6

    psadbw          mm3,        mm0
    psadbw          mm7,        mm0

    paddw           mm3,        mm4                 ; total SAD at offset +1
    paddw           mm7,        mm5                 ; total SAD at offset +2

    mov             rcx,        result_ptr

    punpckldq       mm1,        mm3

    movq            [rcx],      mm1
    movd            [rcx+8],    mm7

    STACK_FRAME_DESTROY_X3

;unsigned int vp8_sad16x16_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_sad)
; Plain 16x16 SAD, returned in rax.  NOTE(review): max_sad is accepted but
; not used for early termination in this implementation.
;%define lddqu movdqu
global sym(vp8_sad16x16_sse3) PRIVATE
sym(vp8_sad16x16_sse3):

    STACK_FRAME_CREATE_X3

    mov             end_ptr,    4                   ; loop counter: 4 x 4 rows
    pxor            xmm7,       xmm7                ; SAD accumulator

.vp8_sad16x16_sse3_loop:
    movdqa          xmm0,       XMMWORD PTR [src_ptr]
    movdqu          xmm1,       XMMWORD PTR [ref_ptr]
    movdqa          xmm2,       XMMWORD PTR [src_ptr+src_stride]
    movdqu          xmm3,       XMMWORD PTR [ref_ptr+ref_stride]

    lea             src_ptr,    [src_ptr+src_stride*2]
    lea             ref_ptr,    [ref_ptr+ref_stride*2]

    movdqa          xmm4,       XMMWORD PTR [src_ptr]
    movdqu          xmm5,       XMMWORD PTR [ref_ptr]
    movdqa          xmm6,       XMMWORD PTR [src_ptr+src_stride]

    psadbw          xmm0,       xmm1

    movdqu          xmm1,       XMMWORD PTR [ref_ptr+ref_stride]

    psadbw          xmm2,       xmm3
    psadbw          xmm4,       xmm5
    psadbw          xmm6,       xmm1

    lea             src_ptr,    [src_ptr+src_stride*2]
    lea             ref_ptr,    [ref_ptr+ref_stride*2]

    paddw           xmm7,       xmm0
    paddw           xmm7,       xmm2
    paddw           xmm7,       xmm4
    paddw           xmm7,       xmm6

    sub             end_ptr,    1
    jne             .vp8_sad16x16_sse3_loop

    ; fold the two 64-bit halves of xmm7 and return in rax
    movq            xmm0,       xmm7
    psrldq          xmm7,       8
    paddw           xmm0,       xmm7
    movq            rax,        xmm0

    STACK_FRAME_DESTROY_X3

;void vp8_copy32xn_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int height);
; Copies a 32-wide, height-tall block.  Loads are unaligned (movdqu);
; stores use movdqa, so dst_ptr must be 16-byte aligned.  Processes 4 rows
; per iteration, then single rows for any remainder.
global sym(vp8_copy32xn_sse3) PRIVATE
sym(vp8_copy32xn_sse3):

    STACK_FRAME_CREATE_X3

.block_copy_sse3_loopx4:
    lea             end_ptr,    [src_ptr+src_stride*2]  ; points 2 rows down

    movdqu          xmm0,       XMMWORD PTR [src_ptr]
    movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
    movdqu          xmm2,       XMMWORD PTR [src_ptr + src_stride]
    movdqu          xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
    movdqu          xmm4,       XMMWORD PTR [end_ptr]
    movdqu          xmm5,       XMMWORD PTR [end_ptr + 16]
    movdqu          xmm6,       XMMWORD PTR [end_ptr + src_stride]
    movdqu          xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]

    lea             src_ptr,    [src_ptr+src_stride*4]

    lea             end_ptr,    [ref_ptr+ref_stride*2]

    movdqa          XMMWORD PTR [ref_ptr], xmm0
    movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
    movdqa          XMMWORD PTR [ref_ptr + ref_stride], xmm2
    movdqa          XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
    movdqa          XMMWORD PTR [end_ptr], xmm4
    movdqa          XMMWORD PTR [end_ptr + 16], xmm5
    movdqa          XMMWORD PTR [end_ptr + ref_stride], xmm6
    movdqa          XMMWORD PTR [end_ptr + ref_stride + 16], xmm7

    lea             ref_ptr,    [ref_ptr+ref_stride*4]

    sub             height,     4
    cmp             height,     4
    jge             .block_copy_sse3_loopx4

    ; check to see if there are more rows that need to be copied
    cmp             height,     0
    je              .copy_is_done

.block_copy_sse3_loop:
    movdqu          xmm0,       XMMWORD PTR [src_ptr]
    movdqu          xmm1,       XMMWORD PTR [src_ptr + 16]
    lea             src_ptr,    [src_ptr+src_stride]

    movdqa          XMMWORD PTR [ref_ptr], xmm0
    movdqa          XMMWORD PTR [ref_ptr + 16], xmm1
    lea             ref_ptr,    [ref_ptr+ref_stride]

    sub             height,     1
    jne             .block_copy_sse3_loop

.copy_is_done:
    STACK_FRAME_DESTROY_X3

;void vp8_sad16x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
; 16x16 SAD against four independent reference blocks; results[0..3].
global sym(vp8_sad16x16x4d_sse3) PRIVATE
sym(vp8_sad16x16x4d_sse3):

    STACK_FRAME_CREATE_X4

    PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
    pop             rbp                     ; restore rbp (was reused as ref_stride)
%endif
    mov             rcx,        result_ptr

    movq            xmm0,       xmm4
    psrldq          xmm4,       8

    paddw           xmm0,       xmm4
    movd            [rcx],      xmm0
;-
    movq            xmm0,       xmm5
    psrldq          xmm5,       8

    paddw           xmm0,       xmm5
    movd            [rcx+4],    xmm0
;-
    movq            xmm0,       xmm6
    psrldq          xmm6,       8

    paddw           xmm0,       xmm6
    movd            [rcx+8],    xmm0
;-
    movq            xmm0,       xmm7
    psrldq          xmm7,       8

    paddw           xmm0,       xmm7
    movd            [rcx+12],   xmm0

    STACK_FRAME_DESTROY_X4

;void vp8_sad16x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
; 16x8 SAD against four references; results[0..3].
global sym(vp8_sad16x8x4d_sse3) PRIVATE
sym(vp8_sad16x8x4d_sse3):

    STACK_FRAME_CREATE_X4

    PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
    pop             rbp
%endif
    mov             rcx,        result_ptr

    movq            xmm0,       xmm4
    psrldq          xmm4,       8

    paddw           xmm0,       xmm4
    movd            [rcx],      xmm0
;-
    movq            xmm0,       xmm5
    psrldq          xmm5,       8

    paddw           xmm0,       xmm5
    movd            [rcx+4],    xmm0
;-
    movq            xmm0,       xmm6
    psrldq          xmm6,       8

    paddw           xmm0,       xmm6
    movd            [rcx+8],    xmm0
;-
    movq            xmm0,       xmm7
    psrldq          xmm7,       8

    paddw           xmm0,       xmm7
    movd            [rcx+12],   xmm0

    STACK_FRAME_DESTROY_X4

;void int vp8_sad8x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
; 8x16 SAD against four references; MMX accumulators mm4..mm7.
global sym(vp8_sad8x16x4d_sse3) PRIVATE
sym(vp8_sad8x16x4d_sse3):

    STACK_FRAME_CREATE_X4

    PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
    pop             rbp
%endif
    mov             rcx,        result_ptr

    ; pack the four 32-bit SADs into two qword stores
    punpckldq       mm4,        mm5
    punpckldq       mm6,        mm7

    movq            [rcx],      mm4
    movq            [rcx+8],    mm6

    STACK_FRAME_DESTROY_X4

;void int vp8_sad8x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
; 8x8 SAD against four references; results[0..3].
global sym(vp8_sad8x8x4d_sse3) PRIVATE
sym(vp8_sad8x8x4d_sse3):

    STACK_FRAME_CREATE_X4

    PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
    pop             rbp
%endif
    mov             rcx,        result_ptr

    punpckldq       mm4,        mm5
    punpckldq       mm6,        mm7

    movq            [rcx],      mm4
    movq            [rcx+8],    mm6

    STACK_FRAME_DESTROY_X4

;void int vp8_sad4x4x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
; 4x4 SAD against four references.  Rows are paired with punpcklbw so one
; psadbw covers two 4-byte rows.
global sym(vp8_sad4x4x4d_sse3) PRIVATE
sym(vp8_sad4x4x4d_sse3):

    STACK_FRAME_CREATE_X4

    movd            mm0,        DWORD PTR [src_ptr]
    movd            mm1,        DWORD PTR [r0_ptr]

    movd            mm2,        DWORD PTR [src_ptr+src_stride]
    movd            mm3,        DWORD PTR [r0_ptr+ref_stride]

    punpcklbw       mm0,        mm2                 ; src rows 0+1
    punpcklbw       mm1,        mm3                 ; ref0 rows 0+1

    movd            mm4,        DWORD PTR [r1_ptr]
    movd            mm5,        DWORD PTR [r2_ptr]

    movd            mm6,        DWORD PTR [r3_ptr]
    movd            mm2,        DWORD PTR [r1_ptr+ref_stride]

    movd            mm3,        DWORD PTR [r2_ptr+ref_stride]
    movd            mm7,        DWORD PTR [r3_ptr+ref_stride]

    psadbw          mm1,        mm0

    punpcklbw       mm4,        mm2                 ; ref1 rows 0+1
    punpcklbw       mm5,        mm3                 ; ref2 rows 0+1

    punpcklbw       mm6,        mm7                 ; ref3 rows 0+1
    psadbw          mm4,        mm0

    psadbw          mm5,        mm0
    psadbw          mm6,        mm0



    lea             src_ptr,    [src_ptr+src_stride*2]
    lea             r0_ptr,     [r0_ptr+ref_stride*2]

    lea             r1_ptr,     [r1_ptr+ref_stride*2]
    lea             r2_ptr,     [r2_ptr+ref_stride*2]

    lea             r3_ptr,     [r3_ptr+ref_stride*2]

    movd            mm0,        DWORD PTR [src_ptr]
    movd            mm2,        DWORD PTR [r0_ptr]

    movd            mm3,        DWORD PTR [src_ptr+src_stride]
    movd            mm7,        DWORD PTR [r0_ptr+ref_stride]

    punpcklbw       mm0,        mm3                 ; src rows 2+3
    punpcklbw       mm2,        mm7

    movd            mm3,        DWORD PTR [r1_ptr]
    movd            mm7,        DWORD PTR [r2_ptr]

    psadbw          mm2,        mm0
%if ABI_IS_32BIT
    ; move ref_stride out of rbp before restoring it, then re-alias
    mov             rax,        rbp

    pop             rbp
%define ref_stride rax
%endif
    mov             rsi,        result_ptr

    paddw           mm1,        mm2
    movd            [rsi],      mm1                 ; results[0]

    movd            mm2,        DWORD PTR [r1_ptr+ref_stride]
    movd            mm1,        DWORD PTR [r2_ptr+ref_stride]

    punpcklbw       mm3,        mm2
    punpcklbw       mm7,        mm1

    psadbw          mm3,        mm0
    psadbw          mm7,        mm0

    movd            mm2,        DWORD PTR [r3_ptr]
    movd            mm1,        DWORD PTR [r3_ptr+ref_stride]

    paddw           mm3,        mm4
    paddw           mm7,        mm5

    movd            [rsi+4],    mm3                 ; results[1]
    punpcklbw       mm2,        mm1

    movd            [rsi+8],    mm7                 ; results[2]
    psadbw          mm2,        mm0

    paddw           mm2,        mm6
    movd            [rsi+12],   mm2                 ; results[3]


    STACK_FRAME_DESTROY_X4