;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Bilinear filter taps are scaled by 128 (2^7); results are rounded and
; shifted back down by this amount after each pmaddubsw pass.
%define xmm_filter_shift            7


;void vp8_filter_block2d_bil_var_ssse3
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int  xoffset,
;    int  yoffset,
;    int *sum,
;    unsigned int *sumsquared
;)
;Note: The filter coefficient at offset=0 is 128. Since the second register
;for Pmaddubsw is signed bytes, we must calculate zero offset separately.
;-----------------------------------------------------------------------------
; vp8_filter_block2d_bil_var_ssse3
;
; Computes sum and sum-of-squares of the difference between a bilinearly
; filtered 16-wide reference block and a source block, over arg(4) rows.
;
; Dispatches to one of four paths depending on the sub-pel offsets:
;   xoffset!=0 && yoffset!=0 : two-pass horizontal + vertical filter
;   xoffset==0 && yoffset!=0 : vertical-only  (.sp_only)
;   xoffset==0 && yoffset==0 : full-pel copy compare (.full_pixel)
;   xoffset!=0 && yoffset==0 : horizontal-only (.fp_only)
;
; Register roles (filtered paths):
;   rsi = ref_ptr       rdi = src_ptr       rcx = row counter
;   rax = HFilter ptr   rdx = VFilter ptr (or a stride, per path)
;   r8/r9 (64-bit ABI)  = ref/src strides
;   xmm6 = running sum (16-bit lanes)   xmm7 = running SSE (32-bit lanes)
;
; Outputs: *arg(7) = sum (int), *arg(8) = SSE (unsigned int).
;-----------------------------------------------------------------------------
global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
sym(vp8_filter_block2d_bil_var_ssse3):
    push            rbp
    mov             rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

    pxor            xmm6, xmm6                 ; clear sum accumulator
    pxor            xmm7, xmm7                 ; clear SSE accumulator

    lea             rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
    movsxd          rax, dword ptr arg(5)      ; xoffset

    cmp             rax, 0                     ; skip first_pass filter if xoffset=0
    je              .filter_block2d_bil_var_ssse3_sp_only

    shl             rax, 4                     ; point to filter coeff with xoffset
    lea             rax, [rax + rcx]           ; HFilter

    movsxd          rdx, dword ptr arg(6)      ; yoffset

    cmp             rdx, 0                     ; skip second_pass filter if yoffset=0
    je              .filter_block2d_bil_var_ssse3_fp_only

    shl             rdx, 4
    lea             rdx, [rdx + rcx]           ; VFilter

    mov             rsi, arg(0)                ;ref_ptr
    mov             rdi, arg(2)                ;src_ptr
    movsxd          rcx, dword ptr arg(4)      ;Height

    ; Prime the two-pass pipeline: horizontally filter row 0 into xmm0 so the
    ; loop always has the previous filtered row available for the vertical tap.
    movdqu          xmm0, XMMWORD PTR [rsi]
    movdqu          xmm1, XMMWORD PTR [rsi+1]
    movdqa          xmm2, xmm0

    punpcklbw       xmm0, xmm1                 ; interleave pixel pairs for pmaddubsw
    punpckhbw       xmm2, xmm1
    pmaddubsw       xmm0, [rax]
    pmaddubsw       xmm2, [rax]

    paddw           xmm0, [GLOBAL(xmm_bi_rd)]  ; round
    paddw           xmm2, [GLOBAL(xmm_bi_rd)]
    psraw           xmm0, xmm_filter_shift
    psraw           xmm2, xmm_filter_shift

    packuswb        xmm0, xmm2                 ; back to 16 unsigned bytes

%if ABI_IS_32BIT
    add             rsi, dword ptr arg(1)      ;ref_pixels_per_line
%else
    movsxd          r8, dword ptr arg(1)       ;ref_pixels_per_line
    movsxd          r9, dword ptr arg(3)       ;src_pixels_per_line
    lea             rsi, [rsi + r8]
%endif

.filter_block2d_bil_var_ssse3_loop:
    ; First pass: horizontal filter of the current row.
    movdqu          xmm1, XMMWORD PTR [rsi]
    movdqu          xmm2, XMMWORD PTR [rsi+1]
    movdqa          xmm3, xmm1

    punpcklbw       xmm1, xmm2
    punpckhbw       xmm3, xmm2
    pmaddubsw       xmm1, [rax]
    pmaddubsw       xmm3, [rax]

    paddw           xmm1, [GLOBAL(xmm_bi_rd)]
    paddw           xmm3, [GLOBAL(xmm_bi_rd)]
    psraw           xmm1, xmm_filter_shift
    psraw           xmm3, xmm_filter_shift
    packuswb        xmm1, xmm3

    ; Second pass: vertical filter between previous (xmm0) and current (xmm1)
    ; horizontally-filtered rows; keep current row in xmm0 for next iteration.
    movdqa          xmm2, xmm0
    movdqa          xmm0, xmm1
    movdqa          xmm3, xmm2

    punpcklbw       xmm2, xmm1
    punpckhbw       xmm3, xmm1
    pmaddubsw       xmm2, [rdx]
    pmaddubsw       xmm3, [rdx]

    paddw           xmm2, [GLOBAL(xmm_bi_rd)]
    paddw           xmm3, [GLOBAL(xmm_bi_rd)]
    psraw           xmm2, xmm_filter_shift
    psraw           xmm3, xmm_filter_shift

    ; Load 16 source pixels, widen to words.
    movq            xmm1, QWORD PTR [rdi]
    pxor            xmm4, xmm4
    punpcklbw       xmm1, xmm4
    movq            xmm5, QWORD PTR [rdi+8]
    punpcklbw       xmm5, xmm4

    ; Accumulate diff into sum (xmm6) and diff^2 into SSE (xmm7).
    psubw           xmm2, xmm1
    psubw           xmm3, xmm5
    paddw           xmm6, xmm2
    paddw           xmm6, xmm3
    pmaddwd         xmm2, xmm2
    pmaddwd         xmm3, xmm3
    paddd           xmm7, xmm2
    paddd           xmm7, xmm3

%if ABI_IS_32BIT
    add             rsi, dword ptr arg(1)      ;ref_pixels_per_line
    add             rdi, dword ptr arg(3)      ;src_pixels_per_line
%else
    lea             rsi, [rsi + r8]
    lea             rdi, [rdi + r9]
%endif

    sub             rcx, 1
    jnz             .filter_block2d_bil_var_ssse3_loop

    jmp             .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_sp_only:
    ; xoffset == 0: vertical (second-pass) filter only, or full-pel compare.
    movsxd          rdx, dword ptr arg(6)      ; yoffset

    cmp             rdx, 0                     ; Both xoffset =0 and yoffset=0
    je              .filter_block2d_bil_var_ssse3_full_pixel

    shl             rdx, 4
    lea             rdx, [rdx + rcx]           ; VFilter

    mov             rsi, arg(0)                ;ref_ptr
    mov             rdi, arg(2)                ;src_ptr
    movsxd          rcx, dword ptr arg(4)      ;Height
    movsxd          rax, dword ptr arg(1)      ;ref_pixels_per_line

    movdqu          xmm1, XMMWORD PTR [rsi]    ; prime: row 0 in xmm1
    movdqa          xmm0, xmm1

%if ABI_IS_32BIT=0
    movsxd          r9, dword ptr arg(3)       ;src_pixels_per_line
%endif

    lea             rsi, [rsi + rax]

.filter_block2d_bil_sp_only_loop:
    movdqu          xmm3, XMMWORD PTR [rsi]    ; next row
    movdqa          xmm2, xmm1
    movdqa          xmm0, xmm3                 ; save for next iteration

    punpcklbw       xmm1, xmm3                 ; vertical tap: prev/next row pairs
    punpckhbw       xmm2, xmm3
    pmaddubsw       xmm1, [rdx]
    pmaddubsw       xmm2, [rdx]

    paddw           xmm1, [GLOBAL(xmm_bi_rd)]
    paddw           xmm2, [GLOBAL(xmm_bi_rd)]
    psraw           xmm1, xmm_filter_shift
    psraw           xmm2, xmm_filter_shift

    movq            xmm3, QWORD PTR [rdi]
    pxor            xmm4, xmm4
    punpcklbw       xmm3, xmm4
    movq            xmm5, QWORD PTR [rdi+8]
    punpcklbw       xmm5, xmm4

    psubw           xmm1, xmm3
    psubw           xmm2, xmm5
    paddw           xmm6, xmm1
    paddw           xmm6, xmm2
    pmaddwd         xmm1, xmm1
    pmaddwd         xmm2, xmm2
    paddd           xmm7, xmm1
    paddd           xmm7, xmm2

    movdqa          xmm1, xmm0                 ; current row becomes "previous"
    lea             rsi, [rsi + rax]           ;ref_pixels_per_line

%if ABI_IS_32BIT
    add             rdi, dword ptr arg(3)      ;src_pixels_per_line
%else
    lea             rdi, [rdi + r9]
%endif

    sub             rcx, 1
    jnz             .filter_block2d_bil_sp_only_loop

    jmp             .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_full_pixel:
    ; xoffset == 0 and yoffset == 0: plain ref vs src difference, no filtering.
    mov             rsi, arg(0)                ;ref_ptr
    mov             rdi, arg(2)                ;src_ptr
    movsxd          rcx, dword ptr arg(4)      ;Height
    movsxd          rax, dword ptr arg(1)      ;ref_pixels_per_line
    movsxd          rdx, dword ptr arg(3)      ;src_pixels_per_line
    pxor            xmm0, xmm0                 ; zero reg for byte->word widening

.filter_block2d_bil_full_pixel_loop:
    movq            xmm1, QWORD PTR [rsi]
    punpcklbw       xmm1, xmm0
    movq            xmm2, QWORD PTR [rsi+8]
    punpcklbw       xmm2, xmm0

    movq            xmm3, QWORD PTR [rdi]
    punpcklbw       xmm3, xmm0
    movq            xmm4, QWORD PTR [rdi+8]
    punpcklbw       xmm4, xmm0

    psubw           xmm1, xmm3
    psubw           xmm2, xmm4
    paddw           xmm6, xmm1
    paddw           xmm6, xmm2
    pmaddwd         xmm1, xmm1
    pmaddwd         xmm2, xmm2
    paddd           xmm7, xmm1
    paddd           xmm7, xmm2

    lea             rsi, [rsi + rax]           ;ref_pixels_per_line
    lea             rdi, [rdi + rdx]           ;src_pixels_per_line
    sub             rcx, 1
    jnz             .filter_block2d_bil_full_pixel_loop

    jmp             .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_fp_only:
    ; yoffset == 0: horizontal (first-pass) filter only; rax = HFilter.
    mov             rsi, arg(0)                ;ref_ptr
    mov             rdi, arg(2)                ;src_ptr
    movsxd          rcx, dword ptr arg(4)      ;Height
    movsxd          rdx, dword ptr arg(1)      ;ref_pixels_per_line

    pxor            xmm0, xmm0

%if ABI_IS_32BIT=0
    movsxd          r9, dword ptr arg(3)       ;src_pixels_per_line
%endif

.filter_block2d_bil_fp_only_loop:
    movdqu          xmm1, XMMWORD PTR [rsi]
    movdqu          xmm2, XMMWORD PTR [rsi+1]
    movdqa          xmm3, xmm1

    punpcklbw       xmm1, xmm2
    punpckhbw       xmm3, xmm2
    pmaddubsw       xmm1, [rax]
    pmaddubsw       xmm3, [rax]

    paddw           xmm1, [GLOBAL(xmm_bi_rd)]
    paddw           xmm3, [GLOBAL(xmm_bi_rd)]
    psraw           xmm1, xmm_filter_shift
    psraw           xmm3, xmm_filter_shift

    ; Fixed size keyword: movq loads 8 bytes, so QWORD PTR (was XMMWORD PTR),
    ; matching the identical source loads in the other paths.
    movq            xmm2, QWORD PTR [rdi]
    pxor            xmm4, xmm4
    punpcklbw       xmm2, xmm4
    movq            xmm5, QWORD PTR [rdi+8]
    punpcklbw       xmm5, xmm4

    psubw           xmm1, xmm2
    psubw           xmm3, xmm5
    paddw           xmm6, xmm1
    paddw           xmm6, xmm3
    pmaddwd         xmm1, xmm1
    pmaddwd         xmm3, xmm3
    paddd           xmm7, xmm1
    paddd           xmm7, xmm3

    lea             rsi, [rsi + rdx]
%if ABI_IS_32BIT
    add             rdi, dword ptr arg(3)      ;src_pixels_per_line
%else
    lea             rdi, [rdi + r9]
%endif

    sub             rcx, 1
    jnz             .filter_block2d_bil_fp_only_loop

    jmp             .filter_block2d_bil_variance

.filter_block2d_bil_variance:
    ; Horizontal reduction: sign-extend the 8 word sums in xmm6 to dwords,
    ; then fold both accumulators down to scalars and store through arg(7)/arg(8).
    pxor            xmm0, xmm0
    pxor            xmm1, xmm1
    pxor            xmm5, xmm5

    punpcklwd       xmm0, xmm6                 ; words into high halves...
    punpckhwd       xmm1, xmm6
    psrad           xmm0, 16                   ; ...then arithmetic shift = sign-extend
    psrad           xmm1, 16
    paddd           xmm0, xmm1
    movdqa          xmm1, xmm0

    movdqa          xmm6, xmm7                 ; fold SSE: dwords -> qwords
    punpckldq       xmm6, xmm5
    punpckhdq       xmm7, xmm5
    paddd           xmm6, xmm7

    punpckldq       xmm0, xmm5                 ; fold sum the same way
    punpckhdq       xmm1, xmm5
    paddd           xmm0, xmm1

    movdqa          xmm7, xmm6
    movdqa          xmm1, xmm0

    psrldq          xmm7, 8                    ; add high qword into low
    psrldq          xmm1, 8

    paddd           xmm6, xmm7
    paddd           xmm0, xmm1

    mov             rsi, arg(7)                ;[Sum]
    mov             rdi, arg(8)                ;[SSE]

    movd            [rsi], xmm0
    movd            [rdi], xmm6

    ; begin epilog
    pop             rdi
    pop             rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop             rbp
    ret


SECTION_RODATA
align 16
; Rounding constant: 64 = half of 128 (2^xmm_filter_shift).
xmm_bi_rd:
    times 8 dw 64
align 16
; Eight 2-tap bilinear filters, one 16-byte row per 1/8-pel offset 0..7.
; Tap pairs sum to 128; offset 0 (128,0) is handled by the full/single-pass
; paths since 128 does not fit in pmaddubsw's signed-byte operand.
vp8_bilinear_filters_ssse3:
    times 8 db 128, 0
    times 8 db 112, 16
    times 8 db  96, 32
    times 8 db  80, 48
    times 8 db  64, 64
    times 8 db  48, 80
    times 8 db  32, 96
    times 8 db  16, 112