;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; FIRST_2_ROWS
; macro in deblock functions
;
; In:    xmm0 = 16 centre pixels
;        xmm1, xmm3 = 16 pixels from each of two neighbouring taps
;        flimit = memory operand holding 16 per-byte thresholds
; Out:   xmm5 = pavgb(xmm1, xmm3)
;        xmm7 = byte mask, 0xFF where |xmm0 - tap| >= flimit for
;               either tap, 0x00 otherwise
;        xmm0 is left untouched
; Clobb: xmm1 (zeroed), xmm2, xmm3, xmm4, xmm6
;-----------------------------------------------------------------------
%macro FIRST_2_ROWS 0
        movdqa      xmm4,       xmm0
        movdqa      xmm6,       xmm0
        movdqa      xmm5,       xmm1
        pavgb       xmm5,       xmm3

        ;calculate absolute value
        ; unsigned |a - b| = (a -sat b) + (b -sat a); one term is always 0
        psubusb     xmm4,       xmm1
        psubusb     xmm1,       xmm0
        psubusb     xmm6,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm4,       xmm1
        paddusb     xmm6,       xmm3

        ;get threshold
        movdqa      xmm2,       flimit
        pxor        xmm1,       xmm1
        movdqa      xmm7,       xmm2

        ;get mask
        ; (flimit -sat diff) == 0  <=>  diff >= flimit
        psubusb     xmm2,       xmm4
        psubusb     xmm7,       xmm6
        pcmpeqb     xmm2,       xmm1
        pcmpeqb     xmm7,       xmm1
        por         xmm7,       xmm2
%endmacro

;-----------------------------------------------------------------------
; SECOND_2_ROWS
;
; In:    xmm0 = 16 centre pixels
;        xmm1, xmm3 = 16 pixels from each of the remaining two taps
;        xmm5, xmm7 = average and mask as left by FIRST_2_ROWS
;        flimit = memory operand holding 16 per-byte thresholds
; Out:   xmm0 = per byte: the original pixel where any of the four
;               absolute differences reached flimit, otherwise
;               pavgb(pavgb(avg of first pair, avg of second pair), pixel)
; Clobb: xmm1 (zeroed), xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
;-----------------------------------------------------------------------
%macro SECOND_2_ROWS 0
        movdqa      xmm6,       xmm0
        movdqa      xmm4,       xmm0
        movdqa      xmm2,       xmm1
        pavgb       xmm1,       xmm3

        ;calculate absolute value
        psubusb     xmm6,       xmm2
        psubusb     xmm2,       xmm0
        psubusb     xmm4,       xmm3
        psubusb     xmm3,       xmm0
        paddusb     xmm6,       xmm2
        paddusb     xmm4,       xmm3

        pavgb       xmm5,       xmm1

        ;get threshold
        movdqa      xmm2,       flimit
        pxor        xmm1,       xmm1
        movdqa      xmm3,       xmm2

        ;get mask
        psubusb     xmm2,       xmm6
        psubusb     xmm3,       xmm4
        pcmpeqb     xmm2,       xmm1
        pcmpeqb     xmm3,       xmm1

        por         xmm7,       xmm2
        por         xmm7,       xmm3

        pavgb       xmm5,       xmm0

        ;decide if or not to use filtered value
        pand        xmm0,       xmm7        ; original pixel where mask is set
        pandn       xmm7,       xmm5        ; filtered pixel where mask is clear
        paddusb     xmm0,       xmm7        ; the two halves are disjoint
%endmacro

;-----------------------------------------------------------------------
; UPDATE_FLIMIT
;
; Copies the 16 threshold bytes at [rbx] to [rsp] and advances rbx by 16.
; The load is an aligned movdqa, so rbx must be 16-byte aligned, and so
; must rsp.
; Clobb: xmm2
;-----------------------------------------------------------------------
%macro UPDATE_FLIMIT 0
        movdqa      xmm2,       XMMWORD PTR [rbx]
        movdqa      [rsp],      xmm2
        add         rbx,        16
%endmacro

;-----------------------------------------------------------------------
;void vp8_post_proc_down_and_across_mb_row_sse2
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int cols,
;    int *flimits,
;    int size
;)
;
; For each of `size` rows: filter vertically from src into dst, 16 pixels
; at a time, then filter the freshly written dst row horizontally in place.
;
; Register roles:
;   rsi = src row           rdi = dst row
;   rax = src stride (temporarily negated inside the column loop)
;   rcx = rows remaining    rdx = column offset within the row
;   rbx = cursor into flimits (16 bytes consumed per 16 columns)
;   [rsp] = current 16 thresholds (the `flimit` operand of the macros)
;   mm0/mm1 = previous 16 filtered bytes awaiting write-back (across pass)
;
; The down pass reads rows -2..+2 around the current src row. The across
; pass writes 8 bytes just left of dst and 8 bytes at dst+cols as border.
; NOTE(review): mm0/mm1 are used and no emms is issued here; presumably
; the caller clears the x87/MMX state - confirm.
;-----------------------------------------------------------------------
global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
sym(vp8_post_proc_down_and_across_mb_row_sse2):
        push        rbp
        mov         rbp,        rsp
        SHADOW_ARGS_TO_STACK 7
        SAVE_XMM 7
        push        rbx
        push        rsi
        push        rdi
        ; end prolog
        ALIGN_STACK 16, rax
        sub         rsp,        16

        ; put flimit on stack
        mov         rbx,        arg(5)              ;flimits ptr
        UPDATE_FLIMIT

%define flimit [rsp]

        mov         rsi,        arg(0)              ;src_ptr
        mov         rdi,        arg(1)              ;dst_ptr

        movsxd      rax,        DWORD PTR arg(2)    ;src_pixels_per_line
        movsxd      rcx,        DWORD PTR arg(6)    ;rows in a macroblock
.nextrow:
        xor         rdx,        rdx                 ;col
.nextcol:
        ;load current and next 2 rows
        movdqu      xmm0,       XMMWORD PTR [rsi]
        movdqu      xmm1,       XMMWORD PTR [rsi + rax]
        movdqu      xmm3,       XMMWORD PTR [rsi + 2*rax]

        FIRST_2_ROWS

        ;load above 2 rows
        neg         rax                             ; negative stride
        movdqu      xmm1,       XMMWORD PTR [rsi + 2*rax]
        movdqu      xmm3,       XMMWORD PTR [rsi + rax]

        SECOND_2_ROWS

        movdqu      XMMWORD PTR [rdi], xmm0

        neg         rax                             ; positive stride
        add         rsi,        16
        add         rdi,        16

        add         rdx,        16
        cmp         edx,        dword arg(4)        ;cols
        jge         .downdone
        UPDATE_FLIMIT
        jmp         .nextcol

.downdone:
        ; done with the all cols, start the across filtering in place
        sub         rsi,        rdx                 ; rewind to start of row
        sub         rdi,        rdx

        mov         rbx,        arg(5)              ; flimits
        UPDATE_FLIMIT

        ; dup the first byte into the left border 8 times
        movq        mm1,        [rdi]
        punpcklbw   mm1,        mm1
        punpcklwd   mm1,        mm1
        punpckldq   mm1,        mm1
        mov         rdx,        -8
        movq        [rdi+rdx],  mm1

        ; dup the last byte into the right border
        movsxd      rdx,        dword arg(4)
        movq        mm1,        [rdi + rdx + -1]
        punpcklbw   mm1,        mm1
        punpcklwd   mm1,        mm1
        punpckldq   mm1,        mm1
        movq        [rdi+rdx],  mm1

        ; Prime the delayed-store registers with what is already in memory
        ; so the first write-back below leaves those 16 bytes unchanged.
        xor         rdx,        rdx
        movq        mm0,        QWORD PTR [rdi-16];
        movq        mm1,        QWORD PTR [rdi-8];

.acrossnextcol:
        movdqu      xmm0,       XMMWORD PTR [rdi + rdx]
        movdqu      xmm1,       XMMWORD PTR [rdi + rdx -2]
        movdqu      xmm3,       XMMWORD PTR [rdi + rdx -1]

        FIRST_2_ROWS

        movdqu      xmm1,       XMMWORD PTR [rdi + rdx +1]
        movdqu      xmm3,       XMMWORD PTR [rdi + rdx +2]

        SECOND_2_ROWS

        ; The previous block is written back only now, after this block
        ; has loaded its left-hand taps from the not-yet-overwritten bytes.
        movq        QWORD PTR [rdi+rdx-16], mm0     ; store previous 8 bytes
        movq        QWORD PTR [rdi+rdx-8], mm1      ; store previous 8 bytes
        movdq2q     mm0,        xmm0                ; low 8 filtered bytes
        psrldq      xmm0,       8
        movdq2q     mm1,        xmm0                ; high 8 filtered bytes

        add         rdx,        16
        cmp         edx,        dword arg(4)        ;cols
        jge         .acrossdone
        UPDATE_FLIMIT
        jmp         .acrossnextcol

.acrossdone:
        ; last 16 pixels
        movq        QWORD PTR [rdi+rdx-16], mm0

        ; the upper 8 bytes are written only when rdx landed exactly on cols
        cmp         edx,        dword arg(4)
        jne         .throw_last_8
        movq        QWORD PTR [rdi+rdx-8], mm1
.throw_last_8:
        ; done with this row
        add         rsi,        rax                 ;next src line
        ; Strides are reloaded sign-extended, matching the prologue, so rax
        ; is a valid 64-bit offset for either sign of stride.
        movsxd      rax,        dword arg(3)        ;dst_pixels_per_line
        add         rdi,        rax                 ;next destination
        movsxd      rax,        dword arg(2)        ;src_pixels_per_line

        mov         rbx,        arg(5)              ;flimits
        UPDATE_FLIMIT

        dec         rcx                             ;decrement count
        jnz         .nextrow                        ;next row

        add         rsp,        16
        pop         rsp
        ; begin epilog
        pop         rdi
        pop         rsi
        pop         rbx
        RESTORE_XMM
        UNSHADOW_ARGS
        pop         rbp
        ret
%undef flimit

;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
;                            int pitch, int rows, int cols,int flimit)
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_xmm) PRIVATE
sym(vp8_mbpost_proc_down_xmm):
        push        rbp
        mov         rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        SAVE_XMM 7
        GET_GOT     rbx
        push        rsi
        push        rdi
        ; end prolog

        ALIGN_STACK 16, rax
        sub         rsp,        128+16

        ; unsigned char d[16][8] at [rsp]
        ; create flimit2 at [rsp+128]
        mov         eax,        dword ptr arg(4)    ;flimit
        mov         [rsp+128],  eax
        mov         [rsp+128+4], eax
        mov         [rsp+128+8], eax
        mov         [rsp+128+12], eax
%define flimit4 [rsp+128]

%if ABI_IS_32BIT=0
        lea         r8,         [GLOBAL(sym(vp8_rv))]
%endif

        ;rows +=8;
        add         dword arg(2), 8

        ;for(c=0; c