;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

%define VP9_FILTER_WEIGHT 128
%define VP9_FILTER_SHIFT  7

;-----------------------------------------------------------------------
;void vp9_post_proc_down_and_across_mmx
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int rows,
;    int cols,
;    int flimit
;)
;
; For every row, two passes are made, four pixels per iteration:
;   1. "down":   a 5-tap vertical filter over src rows -2..+2 is written
;                to dst.
;   2. "across": a 5-tap horizontal filter over dst pixels -2..+2 is
;                applied to that dst row in place.
; In both passes a pixel keeps its unfiltered value when the absolute
; difference to any of its four neighbours exceeds flimit (signed 16-bit
; compare against the low word of flimit); otherwise it takes
; (sum(tap * pixel) + RD) >> VP9_FILTER_SHIFT.
;
; The tap table Blur and the rounding constant rd are defined outside
; this part of the file; the code reads five 8-byte tap vectors at
; Blur+0/16/32/48/64 and one 8-byte vector at rd.
;
; Register roles inside the loops:
;   rsi = src pointer        rdi = dst pointer
;   rax = src pitch (temporarily negated in the down pass, scratch in
;         the across pass)
;   rbx = address of Blur    rcx = rows remaining   rdx = column offset
;   mm0 = zero               mm2 = flimit in all four words
;
; Memory touched beyond rows x cols (visible from the addressing below):
;   - src rows -2 .. +2 around every processed row,
;   - dst bytes from [row - 4] up to [row + last_offset + 7], where
;     last_offset is the final multiple of 4 used by the column loop.
; Both column loops are bottom-tested, so at least four columns are
; processed and cols is effectively rounded up to a multiple of 4.
;-----------------------------------------------------------------------
global sym(vp9_post_proc_down_and_across_mmx) PRIVATE
sym(vp9_post_proc_down_and_across_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Save rbx before the optional stack spill below so that rsp stays
    ; fixed for as long as RD may be defined as [rsp].
    push        rbx

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ; move the global rd onto the stack, since we don't have enough registers
    ; to do PIC addressing
    movq        mm0, [GLOBAL(rd)]
    sub         rsp, 8
    movq        [rsp], mm0
%define RD [rsp]
%else
%define RD [GLOBAL(rd)]
%endif

    ; rbx is repurposed as the tap-table base from here on.
    lea         rbx, [GLOBAL(Blur)]

    movd        mm2, dword ptr arg(6)       ; flimit
    punpcklwd   mm2, mm2
    punpckldq   mm2, mm2                    ; mm2 = low word of flimit x4

    mov         rsi, arg(0)                 ; src_ptr
    mov         rdi, arg(1)                 ; dst_ptr

    movsxd      rcx, DWORD PTR arg(4)       ; rows
    movsxd      rax, DWORD PTR arg(2)       ; src_pixels_per_line (source pitch)
    pxor        mm0, mm0                    ; mm0 = 00000000

.nextrow:

    xor         rdx, rdx                    ; rdx = column offset within the row

.nextcol:

    pxor        mm7, mm7                    ; mm7 = 00000000
    movq        mm6, [rbx + 32]             ; mm6 = centre taps
    movq        mm3, [rsi]                  ; mm3 = r0 p0..p7
    punpcklbw   mm3, mm0                    ; mm3 = r0 p0..p3 as words
    movq        mm1, mm3                    ; mm1 = r0 p0..p3 (unfiltered copy)
    pmullw      mm3, mm6                    ; mm3 = r0 * centre taps

    movq        mm6, [rbx + 48]             ; mm6 = taps for row +1
    movq        mm5, [rsi + rax]            ; mm5 = r1 p0..p7
    punpcklbw   mm5, mm0                    ; mm5 = r1 p0..p3
    pmullw      mm6, mm5                    ; mm6 = r1 * taps
    paddusw     mm3, mm6                    ; mm3 += mm6

    ; thresholding
    movq        mm7, mm1                    ; mm7 = r0 p0..p3
    psubusw     mm7, mm5                    ; mm7 = sat(r0 - r1)
    psubusw     mm5, mm1                    ; mm5 = sat(r1 - r0)
    paddusw     mm7, mm5                    ; mm7 = abs(r0 - r1)
    pcmpgtw     mm7, mm2                    ; mm7 = mask of abs diff > flimit

    movq        mm6, [rbx + 64]             ; mm6 = taps for row +2
    movq        mm5, [rsi + 2*rax]          ; mm5 = r2 p0..p7
    punpcklbw   mm5, mm0                    ; mm5 = r2 p0..p3
    pmullw      mm6, mm5                    ; mm6 = r2 * taps
    paddusw     mm3, mm6                    ; mm3 += mm6

    ; thresholding
    movq        mm6, mm1                    ; mm6 = r0 p0..p3
    psubusw     mm6, mm5                    ; mm6 = sat(r0 - r2)
    psubusw     mm5, mm1                    ; mm5 = sat(r2 - r0)
    paddusw     mm6, mm5                    ; mm6 = abs(r0 - r2)
    pcmpgtw     mm6, mm2
    por         mm7, mm6                    ; accumulate thresholds


    neg         rax                         ; rax = -pitch to reach rows above
    movq        mm6, [rbx]                  ; mm6 = taps for row -2
    movq        mm5, [rsi + 2*rax]          ; mm5 = r-2 p0..p7
    punpcklbw   mm5, mm0                    ; mm5 = r-2 p0..p3
    pmullw      mm6, mm5                    ; mm6 = r-2 * taps
    paddusw     mm3, mm6                    ; mm3 += mm6

    ; thresholding
    movq        mm6, mm1                    ; mm6 = r0 p0..p3
    psubusw     mm6, mm5                    ; mm6 = sat(r0 - r-2)
    psubusw     mm5, mm1                    ; mm5 = sat(r-2 - r0)
    paddusw     mm6, mm5                    ; mm6 = abs(r0 - r-2)
    pcmpgtw     mm6, mm2
    por         mm7, mm6                    ; accumulate thresholds

    movq        mm6, [rbx + 16]             ; mm6 = taps for row -1
    movq        mm4, [rsi + rax]            ; mm4 = r-1 p0..p7
    punpcklbw   mm4, mm0                    ; mm4 = r-1 p0..p3
    pmullw      mm6, mm4                    ; mm6 = r-1 * taps
    paddusw     mm3, mm6                    ; mm3 += mm6

    ; thresholding
    movq        mm6, mm1                    ; mm6 = r0 p0..p3
    psubusw     mm6, mm4                    ; mm6 = sat(r0 - r-1)
    psubusw     mm4, mm1                    ; mm4 = sat(r-1 - r0)
    paddusw     mm6, mm4                    ; mm6 = abs(r0 - r-1)
    pcmpgtw     mm6, mm2
    por         mm7, mm6                    ; accumulate thresholds


    paddusw     mm3, RD                     ; mm3 += round value
    psraw       mm3, VP9_FILTER_SHIFT       ; mm3 /= 128

    pand        mm1, mm7                    ; keep source where a diff > flimit
    pandn       mm7, mm3                    ; keep blurred result elsewhere
    paddusw     mm1, mm7                    ; combination

    packuswb    mm1, mm0                    ; pack to bytes

    movd        [rdi], mm1                  ; store four filtered pixels
    neg         rax                         ; pitch is positive again


    add         rsi, 4
    add         rdi, 4
    add         rdx, 4

    cmp         edx, dword ptr arg(5)       ; cols
    jl          .nextcol

    ; done with all the cols, start the across filtering in place
    sub         rsi, rdx                    ; rewind to the start of the row
    sub         rdi, rdx

    ; rax is used as scratch below.  It is reloaded from arg(2) once the
    ; row is finished instead of being pushed, so rsp does not move while
    ; RD may be [rsp].
    xor         rdx, rdx
    mov         rax, [rdi-4]                ; eax = the four bytes left of the
                                            ; row; only eax is written back

.acrossnextcol:
    pxor        mm7, mm7                    ; mm7 = 00000000
    movq        mm6, [rbx + 32]             ; mm6 = centre taps
    movq        mm4, [rdi + rdx]            ; mm4 = p0..p7
    movq        mm3, mm4                    ; mm3 = p0..p7
    punpcklbw   mm3, mm0                    ; mm3 = p0..p3
    movq        mm1, mm3                    ; mm1 = p0..p3 (unfiltered copy)
    pmullw      mm3, mm6                    ; mm3 = p0..p3 * centre taps

    movq        mm6, [rbx + 48]             ; mm6 = taps for pixel +1
    psrlq       mm4, 8                      ; mm4 = p1..p7
    movq        mm5, mm4                    ; mm5 = p1..p7
    punpcklbw   mm5, mm0                    ; mm5 = p1..p4
    pmullw      mm6, mm5                    ; mm6 = p1..p4 * taps
    paddusw     mm3, mm6                    ; mm3 += mm6

    ; thresholding
    movq        mm7, mm1                    ; mm7 = p0..p3
    psubusw     mm7, mm5                    ; mm7 = sat(p0..p3 - p1..p4)
    psubusw     mm5, mm1                    ; mm5 = sat(p1..p4 - p0..p3)
    paddusw     mm7, mm5                    ; mm7 = abs(p0..p3 - p1..p4)
    pcmpgtw     mm7, mm2

    movq        mm6, [rbx + 64]             ; mm6 = taps for pixel +2
    psrlq       mm4, 8                      ; mm4 = p2..p7
    movq        mm5, mm4                    ; mm5 = p2..p7
    punpcklbw   mm5, mm0                    ; mm5 = p2..p5
    pmullw      mm6, mm5                    ; mm6 = p2..p5 * taps
    paddusw     mm3, mm6                    ; mm3 += mm6

    ; thresholding
    movq        mm6, mm1                    ; mm6 = p0..p3
    psubusw     mm6, mm5                    ; mm6 = sat(p0..p3 - p2..p5)
    psubusw     mm5, mm1                    ; mm5 = sat(p2..p5 - p0..p3)
    paddusw     mm6, mm5                    ; mm6 = abs(p0..p3 - p2..p5)
    pcmpgtw     mm6, mm2
    por         mm7, mm6                    ; accumulate thresholds


    movq        mm6, [rbx]                  ; mm6 = taps for pixel -2
    movq        mm4, [rdi + rdx - 2]        ; mm4 = p-2..p5
    movq        mm5, mm4                    ; mm5 = p-2..p5
    punpcklbw   mm5, mm0                    ; mm5 = p-2..p1
    pmullw      mm6, mm5                    ; mm6 = p-2..p1 * taps
    paddusw     mm3, mm6                    ; mm3 += mm6

    ; thresholding
    movq        mm6, mm1                    ; mm6 = p0..p3
    psubusw     mm6, mm5                    ; mm6 = sat(p0..p3 - p-2..p1)
    psubusw     mm5, mm1                    ; mm5 = sat(p-2..p1 - p0..p3)
    paddusw     mm6, mm5                    ; mm6 = abs(p0..p3 - p-2..p1)
    pcmpgtw     mm6, mm2
    por         mm7, mm6                    ; accumulate thresholds

    movq        mm6, [rbx + 16]             ; mm6 = taps for pixel -1
    psrlq       mm4, 8                      ; mm4 = p-1..p5
    punpcklbw   mm4, mm0                    ; mm4 = p-1..p2
    pmullw      mm6, mm4                    ; mm6 = p-1..p2 * taps
    paddusw     mm3, mm6                    ; mm3 += mm6

    ; thresholding
    movq        mm6, mm1                    ; mm6 = p0..p3
    psubusw     mm6, mm4                    ; mm6 = sat(p0..p3 - p-1..p2)
    psubusw     mm4, mm1                    ; mm4 = sat(p-1..p2 - p0..p3)
    paddusw     mm6, mm4                    ; mm6 = abs(p0..p3 - p-1..p2)
    pcmpgtw     mm6, mm2
    por         mm7, mm6                    ; accumulate thresholds

    paddusw     mm3, RD                     ; mm3 += round value
    psraw       mm3, VP9_FILTER_SHIFT       ; mm3 /= 128

    pand        mm1, mm7                    ; keep source where a diff > flimit
    pandn       mm7, mm3                    ; keep blurred result elsewhere
    paddusw     mm1, mm7                    ; combination

    packuswb    mm1, mm0                    ; pack to bytes

    ; The store is delayed by one iteration: the previous group is written
    ; only after this group's left neighbours (p-2, p-1) were read above,
    ; so those taps always see unfiltered pixels.
    mov         DWORD PTR [rdi + rdx - 4], eax  ; store previous four bytes
    movd        eax, mm1

    add         rdx, 4
    cmp         edx, dword ptr arg(5)       ; cols
    jl          .acrossnextcol

    mov         DWORD PTR [rdi + rdx - 4], eax  ; flush the last four bytes

    ; done with this row
    movsxd      rax, dword ptr arg(3)       ; dst_pixels_per_line
    add         rdi, rax                    ; next destination line
    movsxd      rax, dword ptr arg(2)       ; src_pixels_per_line
    add         rsi, rax                    ; next source line; rax = src pitch
                                            ; again for the next row

    dec         rcx                         ; decrement count
    jnz         .nextrow                    ; next row

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    add         rsp, 8                      ; release the stack copy of rd
%endif
    pop         rbx

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef RD


;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
;                             int pitch, int rows, int cols,int flimit)
extern sym(vp9_rv)
global sym(vp9_mbpost_proc_down_mmx) PRIVATE
sym(vp9_mbpost_proc_down_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 136

    ; unsigned char d[16][8] at [rsp]
    ; create flimit2 at [rsp+128]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
%define flimit2 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,       [GLOBAL(sym(vp9_rv))]
%endif

    ;rows +=8;
    add         dword ptr arg(2), 8

    ;for(c=0; c