;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%macro VERTx4 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040

    movdqa      xmm4, [rdx]                 ;load filters
    movd        xmm5, rcx
    packsswb    xmm4, xmm4
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

.loop:
    movd        xmm0, [rsi]                 ;A
    movd        xmm1, [rsi + rdx]           ;B
    movd        xmm2, [rsi + rdx * 2]       ;C
    movd        xmm3, [rax + rdx * 2]       ;D
    movd        xmm4, [rsi + rdx * 4]       ;E
    movd        xmm5, [rax + rdx * 4]       ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movd        xmm6, [rsi + rbx]           ;G
    movd        xmm7, [rax + rbx]           ;H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    punpcklbw   xmm6, xmm7                  ;G H
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    movdqa      xmm1, xmm2
    paddsw      xmm0, xmm6
    pmaxsw      xmm2, xmm4
    pminsw      xmm4, xmm1
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd
    psraw       xmm0, 7
    packuswb    xmm0, xmm0

    add         rsi, rdx
    add         rax, rdx
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm
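
; Note: VERTx4 above and VERTx8/VERTx16 below appear to evaluate the same
; 8-tap vertical filter on 4, 8 or 16 pixels per row. Per output pixel the
; computation is roughly (pseudocode, names illustrative):
;
;   sum = A*k0 + B*k1 + C*k2 + D*k3 + E*k4 + F*k5 + G*k6 + H*k7
;   out = clip_to_u8((sum + 64) >> 7)       ; krd holds the rounding value 64
;
; where A..H are the eight vertically adjacent source pixels. Each pmaddubsw
; produces one pair of partial sums; the pmaxsw/pminsw pair presumably orders
; the two middle partial sums so the saturating adds accumulate the smaller
; term before the larger one, keeping the saturated result closer to the
; full-precision sum.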

%macro VERTx8 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx
    packsswb    xmm4, xmm4
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

.loop:
    movq        xmm0, [rsi]                 ;A
    movq        xmm1, [rsi + rdx]           ;B
    movq        xmm2, [rsi + rdx * 2]       ;C
    movq        xmm3, [rax + rdx * 2]       ;D
    movq        xmm4, [rsi + rdx * 4]       ;E
    movq        xmm5, [rax + rdx * 4]       ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movq        xmm6, [rsi + rbx]           ;G
    movq        xmm7, [rax + rbx]           ;H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    punpcklbw   xmm6, xmm7                  ;G H
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    paddsw      xmm0, xmm6
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4
    pminsw      xmm4, xmm1
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd
    psraw       xmm0, 7
    packuswb    xmm0, xmm0

    add         rsi, rdx
    add         rax, rdx
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm


%macro VERTx16 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx
    packsswb    xmm4, xmm4
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

.loop:
    movq        xmm0, [rsi]                 ;A
    movq        xmm1, [rsi + rdx]           ;B
    movq        xmm2, [rsi + rdx * 2]       ;C
    movq        xmm3, [rax + rdx * 2]       ;D
    movq        xmm4, [rsi + rdx * 4]       ;E
    movq        xmm5, [rax + rdx * 4]       ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movq        xmm6, [rsi + rbx]           ;G
    movq        xmm7, [rax + rbx]           ;H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    punpcklbw   xmm6, xmm7                  ;G H
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    paddsw      xmm0, xmm6
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4
    pminsw      xmm4, xmm1
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd
    psraw       xmm0, 7
    packuswb    xmm0, xmm0
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0

    movq        xmm0, [rsi + 8]             ;A
    movq        xmm1, [rsi + rdx + 8]       ;B
    movq        xmm2, [rsi + rdx * 2 + 8]   ;C
    movq        xmm3, [rax + rdx * 2 + 8]   ;D
    movq        xmm4, [rsi + rdx * 4 + 8]   ;E
    movq        xmm5, [rax + rdx * 4 + 8]   ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movq        xmm6, [rsi + rbx + 8]       ;G
    movq        xmm7, [rax + rbx + 8]       ;H
    punpcklbw   xmm6, xmm7                  ;G H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm4
    paddsw      xmm0, krd

    psraw       xmm0, 7
    packuswb    xmm0, xmm0

    add         rsi, rdx
    add         rax, rdx
%if %1
    movq        xmm1, [rdi+8]
    pavgb       xmm0, xmm1
%endif

    movq        [rdi+8], xmm0

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm

;void vp9_filter_block1d4_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int src_pitch,
;    unsigned char *output_ptr,
;    unsigned int out_pitch,
;    unsigned int output_height,
;    short *filter
;)
global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    VERTx4 0

    add rsp, 16*5
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
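
; Note: the wrapper functions below follow the same pattern as the one above:
; reserve aligned stack space for the broadcast filter taps (k0k1..k6k7) and
; the rounding constant (krd), invoke the width-specific VERTx*/HORIZx* macro,
; then unwind. Passing 1 to the macro (the *_avg_* variants) selects the path
; that pavgb-averages the filter result with the bytes already in the
; destination.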

;void vp9_filter_block1d8_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int src_pitch,
;    unsigned char *output_ptr,
;    unsigned int out_pitch,
;    unsigned int output_height,
;    short *filter
;)
global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    VERTx8 0

    add rsp, 16*5
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_filter_block1d16_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int src_pitch,
;    unsigned char *output_ptr,
;    unsigned int out_pitch,
;    unsigned int output_height,
;    short *filter
;)
global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    VERTx16 0

    add rsp, 16*5
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~


global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    VERTx4 1

    add rsp, 16*5
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    VERTx8 1

    add rsp, 16*5
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    VERTx16 1

    add rsp, 16*5
    pop rsp
    pop rbx
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
%macro HORIZx4_ROW 2
    movdqa      %2, %1
    pshufb      %1, [GLOBAL(shuf_t0t1)]
    pshufb      %2, [GLOBAL(shuf_t2t3)]
    pmaddubsw   %1, k0k1k4k5
    pmaddubsw   %2, k2k3k6k7

    movdqa      xmm4, %1
    movdqa      xmm5, %2
    psrldq      %1, 8
    psrldq      %2, 8
    movdqa      xmm6, xmm5

    paddsw      xmm4, %2
    pmaxsw      xmm5, %1
    pminsw      %1, xmm6
    paddsw      %1, xmm4
    paddsw      %1, xmm5

    paddsw      %1, krd
    psraw       %1, 7
    packuswb    %1, %1
%endm

%macro HORIZx4 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx
    packsswb    xmm4, xmm4
    pshuflw     xmm6, xmm4, 0b              ;k0_k1
    pshufhw     xmm6, xmm6, 10101010b       ;k0_k1_k4_k5
    pshuflw     xmm7, xmm4, 01010101b       ;k2_k3
    pshufhw     xmm7, xmm7, 11111111b       ;k2_k3_k6_k7
    pshufd      xmm5, xmm5, 0               ;rounding

    movdqa      k0k1k4k5, xmm6
    movdqa      k2k3k6k7, xmm7
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1
.loop:
    ;Do two rows at once
    movq        xmm0, [rsi - 3]             ;load src
    movq        xmm1, [rsi + 5]
    movq        xmm2, [rsi + rax - 3]
    movq        xmm3, [rsi + rax + 5]
    punpcklqdq  xmm0, xmm1
    punpcklqdq  xmm2, xmm3

    HORIZx4_ROW xmm0, xmm1
    HORIZx4_ROW xmm2, xmm3
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
    movd        xmm3, [rdi + rdx]
    pavgb       xmm2, xmm3
%endif
    movd        [rdi], xmm0
    movd        [rdi + rdx], xmm2

    lea         rsi, [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]

    dec         rcx
    jnz         .loop

    ; Do last row if output_height is odd
    movsxd      rcx, dword ptr arg(4)       ;output_height
    and         rcx, 1
    je          .done

    movq        xmm0, [rsi - 3]             ;load src
    movq        xmm1, [rsi + 5]
    punpcklqdq  xmm0, xmm1

    HORIZx4_ROW xmm0, xmm1
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
.done
%endm
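
; Note: the HORIZx* macros (HORIZx4 above, HORIZx8/HORIZx16 below) are the
; horizontal counterparts. Each row is loaded starting at [src - 3], and
; pshufb with the shuf_t* tables (see SECTION_RODATA at the end of this file)
; gathers the overlapping (src[n], src[n+1]) byte pairs so that one pmaddubsw
; yields the k_n/k_n+1 partial sums for several output pixels at once.
; Per output pixel the result is, roughly (pseudocode):
;
;   sum    = src[x-3]*k0 + src[x-2]*k1 + ... + src[x+4]*k7
;   out[x] = clip_to_u8((sum + 64) >> 7)
;
; HORIZx4 and HORIZx8 process two rows per loop iteration and handle an odd
; output_height with a single-row tail after the loop.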

%macro HORIZx8_ROW 4
    movdqa      %2, %1
    movdqa      %3, %1
    movdqa      %4, %1

    pshufb      %1, [GLOBAL(shuf_t0t1)]
    pshufb      %2, [GLOBAL(shuf_t2t3)]
    pshufb      %3, [GLOBAL(shuf_t4t5)]
    pshufb      %4, [GLOBAL(shuf_t6t7)]

    pmaddubsw   %1, k0k1
    pmaddubsw   %2, k2k3
    pmaddubsw   %3, k4k5
    pmaddubsw   %4, k6k7

    paddsw      %1, %4
    movdqa      %4, %2
    pmaxsw      %2, %3
    pminsw      %3, %4
    paddsw      %1, %3
    paddsw      %1, %2

    paddsw      %1, krd
    psraw       %1, 7
    packuswb    %1, %1
%endm

%macro HORIZx8 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040

    movdqa      xmm4, [rdx]                 ;load filters
    movd        xmm5, rcx
    packsswb    xmm4, xmm4
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1

.loop:
    movq        xmm0, [rsi - 3]             ;load src
    movq        xmm3, [rsi + 5]
    movq        xmm4, [rsi + rax - 3]
    movq        xmm7, [rsi + rax + 5]
    punpcklqdq  xmm0, xmm3
    punpcklqdq  xmm4, xmm7

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
    HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
%if %1
    movq        xmm1, [rdi]
    movq        xmm2, [rdi + rdx]
    pavgb       xmm0, xmm1
    pavgb       xmm4, xmm2
%endif
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm4

    lea         rsi, [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]
    dec         rcx
    jnz         .loop

    ;Do last row if output_height is odd
    movsxd      rcx, dword ptr arg(4)       ;output_height
    and         rcx, 1
    je          .done

    movq        xmm0, [rsi - 3]
    movq        xmm3, [rsi + 5]
    punpcklqdq  xmm0, xmm3

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0
.done
%endm

%macro HORIZx16 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx
    packsswb    xmm4, xmm4
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height

.loop:
    prefetcht0  [rsi + 2 * rax - 3]

    movq        xmm0, [rsi - 3]             ;load src data
    movq        xmm4, [rsi + 5]
    movq        xmm7, [rsi + 13]
    punpcklqdq  xmm0, xmm4
    punpcklqdq  xmm4, xmm7

    movdqa      xmm1, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm5, xmm4
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm4

    pshufb      xmm0, [GLOBAL(shuf_t0t1)]
    pshufb      xmm1, [GLOBAL(shuf_t2t3)]
    pshufb      xmm2, [GLOBAL(shuf_t4t5)]
    pshufb      xmm3, [GLOBAL(shuf_t6t7)]
    pshufb      xmm4, [GLOBAL(shuf_t0t1)]
    pshufb      xmm5, [GLOBAL(shuf_t2t3)]
    pshufb      xmm6, [GLOBAL(shuf_t4t5)]
    pshufb      xmm7, [GLOBAL(shuf_t6t7)]

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm1, k2k3
    pmaddubsw   xmm2, k4k5
    pmaddubsw   xmm3, k6k7
    pmaddubsw   xmm4, k0k1
    pmaddubsw   xmm5, k2k3
    pmaddubsw   xmm6, k4k5
    pmaddubsw   xmm7, k6k7

    paddsw      xmm0, xmm3
    movdqa      xmm3, xmm1
    pmaxsw      xmm1, xmm2
    pminsw      xmm2, xmm3
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm1

    paddsw      xmm4, xmm7
    movdqa      xmm7, xmm5
    pmaxsw      xmm5, xmm6
    pminsw      xmm6, xmm7
    paddsw      xmm4, xmm6
    paddsw      xmm4, xmm5

    paddsw      xmm0, krd
    paddsw      xmm4, krd
    psraw       xmm0, 7
    psraw       xmm4, 7
    packuswb    xmm0, xmm0
    packuswb    xmm4, xmm4
    punpcklqdq  xmm0, xmm4
%if %1
    movdqa      xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif

    lea         rsi, [rsi + rax]
    movdqa      [rdi], xmm0

    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop
%endm

;void vp9_filter_block1d4_h8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int output_pitch,
;    unsigned int output_height,
;    short *filter
;)
global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 3
    %define k0k1k4k5 [rsp + 16 * 0]
    %define k2k3k6k7 [rsp + 16 * 1]
    %define krd      [rsp + 16 * 2]

    HORIZx4 0

    add rsp, 16 * 3
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
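
; Note: unlike the vertical wrappers, the horizontal wrappers use
; GET_GOT/RESTORE_GOT, presumably because the HORIZx* macros reference the
; GLOBAL(shuf_t*) tables in the read-only data section.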

;void vp9_filter_block1d8_h8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int output_pitch,
;    unsigned int output_height,
;    short *filter
;)
global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    HORIZx8 0

    add rsp, 16*5
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_filter_block1d16_h8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int src_pixels_per_line,
;    unsigned char *output_ptr,
;    unsigned int output_pitch,
;    unsigned int output_height,
;    short *filter
;)
global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    HORIZx16 0

    add rsp, 16*5
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 3
    %define k0k1k4k5 [rsp + 16 * 0]
    %define k2k3k6k7 [rsp + 16 * 1]
    %define krd      [rsp + 16 * 2]

    HORIZx4 1

    add rsp, 16 * 3
    pop rsp
    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    HORIZx8 1

    add rsp, 16*5
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd  [rsp + 16*4]

    HORIZx16 1

    add rsp, 16*5
    pop rsp

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
shuf_t0t1:
    db  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
align 16
shuf_t2t3:
    db  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
align 16
shuf_t4t5:
    db  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
align 16
shuf_t6t7:
    db  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
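
; Note: each shuf_t* table above produces the interleaved (src[n], src[n+1])
; byte pairs for one pair of filter taps across eight consecutive output
; pixels, matching pmaddubsw's pairwise u8*s8 multiply-accumulate; the tables
; differ only in their starting offset (0, 2, 4 and 6).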