;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%macro PROCESS_16X2X3 1
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm5,       XMMWORD PTR [rdi]
        lddqu           xmm6,       XMMWORD PTR [rdi+1]
        lddqu           xmm7,       XMMWORD PTR [rdi+2]

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        lddqu           xmm1,       XMMWORD PTR [rdi]
        lddqu           xmm2,       XMMWORD PTR [rdi+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro

%macro PROCESS_16X2X3_OFFSET 2
%if %1
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm7,       XMMWORD PTR [rdi+16]

        movdqa          xmm5,       xmm7
        palignr         xmm5,       xmm4,       %2

        movdqa          xmm6,       xmm7
        palignr         xmm6,       xmm4,       (%2+1)

        palignr         xmm7,       xmm4,       (%2+2)

        psadbw          xmm5,       xmm0
        psadbw          xmm6,       xmm0
        psadbw          xmm7,       xmm0
%else
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
        movdqa          xmm3,       XMMWORD PTR [rdi+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endif
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]

        movdqa          xmm1,       xmm3
        palignr         xmm1,       xmm4,       %2

        movdqa          xmm2,       xmm3
        palignr         xmm2,       xmm4,       (%2+1)

        palignr         xmm3,       xmm4,       (%2+2)

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        psadbw          xmm1,       xmm0
        psadbw          xmm2,       xmm0
        psadbw          xmm3,       xmm0

        paddw           xmm5,       xmm1
        paddw           xmm6,       xmm2
        paddw           xmm7,       xmm3
%endmacro
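; The two macros above accumulate, two rows at a time, the sums of absolute
; differences between a 16-byte source row and the reference row at byte
; offsets 0, 1 and 2.  xmm5/xmm6/xmm7 carry the three running totals, and
; psadbw leaves each row's partial sums in the low word of each quadword.
; PROCESS_16X2X3 uses unaligned lddqu loads; PROCESS_16X2X3_OFFSET instead
; builds the misaligned reference rows from two aligned movdqa loads joined
; with palignr.  A rough C model of what one full 16-wide call computes
; (an illustrative sketch only; sad16xHx3_model is not a function in this
; tree):
;
;   void sad16xHx3_model(const unsigned char *src, int src_stride,
;                        const unsigned char *ref, int ref_stride,
;                        int h, int *results) {
;     for (int off = 0; off < 3; ++off) {
;       int sad = 0;
;       for (int r = 0; r < h; ++r)
;         for (int c = 0; c < 16; ++c)
;           sad += abs(src[r * src_stride + c] -
;                      ref[r * ref_stride + c + off]);
;       results[off] = sad;   /* SAD against ref + off */
;     }
;   }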
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro

%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1
        PROCESS_16X2X3_OFFSET 0, %1

        jmp             %2_store_off

%endmacro

;void vp9_sad16x16x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
global sym(vp9_sad16x16x3_ssse3) PRIVATE
sym(vp9_sad16x16x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf
        and             rdx,        rdi

        jmp .vp9_sad16x16x3_ssse3_skiptable
.vp9_sad16x16x3_ssse3_jumptable:
        dd .vp9_sad16x16x3_ssse3_aligned_by_0  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_1  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_2  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_3  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_4  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_5  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_6  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_7  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_8  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_9  - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_10 - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_11 - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_12 - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_13 - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_14 - .vp9_sad16x16x3_ssse3_do_jump
        dd .vp9_sad16x16x3_ssse3_aligned_by_15 - .vp9_sad16x16x3_ssse3_do_jump
.vp9_sad16x16x3_ssse3_skiptable:

        call .vp9_sad16x16x3_ssse3_do_jump
.vp9_sad16x16x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vp9_sad16x16x3_ssse3_jumptable - .vp9_sad16x16x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp9_sad16x16x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx
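; Dispatch note: the low four bits of ref_ptr (masked into rdx above) select
; one of sixteen alignment-specialized loops.  The call/pop pair is a
; position-independent way to read the current instruction pointer: rcx ends
; up holding the address of .vp9_sad16x16x3_ssse3_do_jump, each 32-bit
; jumptable entry is an offset relative to that label, and "jmp rcx" lands on
; the matching aligned_by_N block.  Cases 0-14 re-align rdi and rebuild the
; misaligned rows with palignr; case 15 uses the plain lddqu path instead,
; since a palignr shift of (%2+2) would exceed 16 bytes there.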
        PROCESS_16X16X3_OFFSET 0,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 1,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 2,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 3,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 4,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 5,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 6,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 7,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 8,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 9,  .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 10, .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 11, .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 12, .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 13, .vp9_sad16x16x3_ssse3
        PROCESS_16X16X3_OFFSET 14, .vp9_sad16x16x3_ssse3

.vp9_sad16x16x3_ssse3_aligned_by_15:
        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

.vp9_sad16x16x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
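; Store note: after the row loops, each accumulator holds two 16-bit partial
; SADs, one per quadword (psadbw sums the low and high 8 bytes of each row
; separately).  The movq/psrldq/paddw sequence above folds the high partial
; onto the low one, and movd writes each combined 32-bit total to the
; results array.  The same reduction closes the 16x8 routine below, which
; differs only in running four row-pair iterations instead of eight.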
;void vp9_sad16x8x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
global sym(vp9_sad16x8x3_ssse3) PRIVATE
sym(vp9_sad16x8x3_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rcx
    ; end prolog

        mov             rsi,        arg(0) ;src_ptr
        mov             rdi,        arg(2) ;ref_ptr

        mov             rdx,        0xf
        and             rdx,        rdi

        jmp .vp9_sad16x8x3_ssse3_skiptable
.vp9_sad16x8x3_ssse3_jumptable:
        dd .vp9_sad16x8x3_ssse3_aligned_by_0  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_1  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_2  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_3  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_4  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_5  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_6  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_7  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_8  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_9  - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_10 - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_11 - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_12 - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_13 - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_14 - .vp9_sad16x8x3_ssse3_do_jump
        dd .vp9_sad16x8x3_ssse3_aligned_by_15 - .vp9_sad16x8x3_ssse3_do_jump
.vp9_sad16x8x3_ssse3_skiptable:

        call .vp9_sad16x8x3_ssse3_do_jump
.vp9_sad16x8x3_ssse3_do_jump:
        pop             rcx                         ; get the address of do_jump
        mov             rax,  .vp9_sad16x8x3_ssse3_jumptable - .vp9_sad16x8x3_ssse3_do_jump
        add             rax,  rcx  ; get the absolute address of vp9_sad16x8x3_ssse3_jumptable

        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
        add             rcx,        rax

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx

        PROCESS_16X8X3_OFFSET 0,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 1,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 2,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 3,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 4,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 5,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 6,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 7,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 8,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 9,  .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 10, .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 11, .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 12, .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 13, .vp9_sad16x8x3_ssse3
        PROCESS_16X8X3_OFFSET 14, .vp9_sad16x8x3_ssse3

.vp9_sad16x8x3_ssse3_aligned_by_15:

        PROCESS_16X2X3 1
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0
        PROCESS_16X2X3 0

.vp9_sad16x8x3_ssse3_store_off:
        mov             rdi,        arg(4) ;Results

        movq            xmm0,       xmm5
        psrldq          xmm5,       8

        paddw           xmm0,       xmm5
        movd            [rdi],      xmm0
;-
        movq            xmm0,       xmm6
        psrldq          xmm6,       8

        paddw           xmm0,       xmm6
        movd            [rdi+4],    xmm0
;-
        movq            xmm0,       xmm7
        psrldq          xmm7,       8

        paddw           xmm0,       xmm7
        movd            [rdi+8],    xmm0

    ; begin epilog
    pop         rcx
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
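; Usage sketch (illustrative C, not code from this tree): both routines fill
; three ints, results[off] = SAD against ref_ptr + off, so a motion-search
; caller can probe three adjacent horizontal candidates in one call:
;
;   int sads[3];
;   vp9_sad16x16x3_ssse3(src, src_stride, ref, ref_stride, sads);
;   /* sads[0], sads[1], sads[2] = SAD at ref, ref + 1, ref + 2 */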