diff -r 000000000000 -r 6474c204b198 media/libvpx/vp9/common/x86/vp9_postproc_sse2.asm
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/media/libvpx/vp9/common/x86/vp9_postproc_sse2.asm Wed Dec 31 06:09:35 2014 +0100
@@ -0,0 +1,695 @@
+;
+; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
+;
+; Use of this source code is governed by a BSD-style license
+; that can be found in the LICENSE file in the root of the source
+; tree. An additional intellectual property rights grant can be found
+; in the file PATENTS. All contributing project authors may
+; be found in the AUTHORS file in the root of the source tree.
+;
+
+
+%include "vpx_ports/x86_abi_support.asm"
+; Thresholded 5-tap smoothing, 8 pixels per step: a vertical pass from src into dst, then a horizontal pass over the same dst row in place.
+;void vp9_post_proc_down_and_across_xmm
+;(
+;    unsigned char *src_ptr,        source row pointer; the vertical pass reads rows -2..+2 around it
+;    unsigned char *dst_ptr,        destination row pointer; written by the vertical pass, then filtered in place
+;    int src_pixels_per_line,       byte step between source rows
+;    int dst_pixels_per_line,       byte step between destination rows
+;    int rows,                      row count (outer loop counter)
+;    int cols,                      column count; columns are handled in groups of 8 while the counter is < cols
+;    int flimit                     threshold; its low 16 bits are compared against absolute pixel differences
+;)
+global sym(vp9_post_proc_down_and_across_xmm) PRIVATE
+sym(vp9_post_proc_down_and_across_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 7
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+    ALIGN_STACK 16, rax
+    ; copy the global constant rd42 onto the stack, since we don't have enough
+    ; registers to do PIC addressing inside the loops
+    movdqa      xmm0, [GLOBAL(rd42)]
+    sub         rsp, 16
+    movdqa      [rsp], xmm0
+%define RD42 [rsp]
+%else
+%define RD42 [GLOBAL(rd42)]
+%endif
+
+
+    movd        xmm2, dword ptr arg(6) ;flimit (32-bit load)
+    punpcklwd   xmm2, xmm2             ; duplicate the low word of flimit ...
+    punpckldq   xmm2, xmm2             ; ... then the low dword ...
+    punpcklqdq  xmm2, xmm2             ; ... then the low qword: xmm2 = low 16 bits of flimit in all 8 words
+
+    mov         rsi, arg(0) ;src_ptr
+    mov         rdi, arg(1) ;dst_ptr
+
+    movsxd      rcx, DWORD PTR arg(4) ;rows
+    movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line (source pitch, sign-extended)
+    pxor        xmm0, xmm0            ; xmm0 = 0, used to unpack bytes to words and to pack back
+
+.nextrow:
+
+        xor         rdx, rdx                    ; rdx = column counter for this row
+.nextcol:
+        movq        xmm3, QWORD PTR [rsi]       ; xmm3 = r0 p0..p7 (8 bytes of the current row)
+        punpcklbw   xmm3, xmm0                  ; xmm3 = r0 p0..p7 as words
+        movdqa      xmm1, xmm3                  ; xmm1 = r0 p0..p7, kept as the unfiltered value
+        psllw       xmm3, 2                     ; xmm3 = 4 * r0 (start of the weighted sum)
+
+        movq        xmm5, QWORD PTR [rsi + rax] ; xmm5 = r1 p0..p7 (row at +1 pitch)
+        punpcklbw   xmm5, xmm0                  ; xmm5 = r1 p0..p7 as words
+        paddusw     xmm3, xmm5                  ; xmm3 += r1
+
+        ; thresholding
+        movdqa      xmm7, xmm1                  ; xmm7 = r0
+        psubusw     xmm7, xmm5                  ; xmm7 = saturating (r0 - r1)
+        psubusw     xmm5, xmm1                  ; xmm5 = saturating (r1 - r0)
+        paddusw     xmm7, xmm5                  ; xmm7 = abs(r0 - r1)
+        pcmpgtw     xmm7, xmm2                  ; xmm7 = all-ones words where abs(r0 - r1) > flimit
+
+        movq        xmm5, QWORD PTR [rsi + 2*rax] ; xmm5 = r2 p0..p7 (row at +2 pitch)
+        punpcklbw   xmm5, xmm0                  ; xmm5 = r2 p0..p7 as words
+        paddusw     xmm3, xmm5                  ; xmm3 += r2
+
+        ; thresholding
+        movdqa      xmm6, xmm1                  ; xmm6 = r0
+        psubusw     xmm6, xmm5                  ; xmm6 = saturating (r0 - r2)
+        psubusw     xmm5, xmm1                  ; xmm5 = saturating (r2 - r0)
+        paddusw     xmm6, xmm5                  ; xmm6 = abs(r0 - r2)
+        pcmpgtw     xmm6, xmm2                  ; all-ones words where abs(r0 - r2) > flimit
+        por         xmm7, xmm6                  ; accumulate thresholds
+
+
+        neg         rax                         ; rax = -pitch, to address the rows before r0
+        movq        xmm5, QWORD PTR [rsi+2*rax] ; xmm5 = r-2 p0..p7 (row at -2 pitch)
+        punpcklbw   xmm5, xmm0                  ; xmm5 = r-2 p0..p7 as words
+        paddusw     xmm3, xmm5                  ; xmm3 += r-2
+
+        ; thresholding
+        movdqa      xmm6, xmm1                  ; xmm6 = r0
+        psubusw     xmm6, xmm5                  ; xmm6 = saturating (r0 - r-2)
+        psubusw     xmm5, xmm1                  ; xmm5 = saturating (r-2 - r0)
+        paddusw     xmm6, xmm5                  ; xmm6 = abs(r0 - r-2)
+        pcmpgtw     xmm6, xmm2                  ; all-ones words where abs(r0 - r-2) > flimit
+        por         xmm7, xmm6                  ; accumulate thresholds
+
+        movq        xmm4, QWORD PTR [rsi+rax]   ; xmm4 = r-1 p0..p7 (row at -1 pitch)
+        punpcklbw   xmm4, xmm0                  ; xmm4 = r-1 p0..p7 as words
+        paddusw     xmm3, xmm4                  ; xmm3 += r-1
+
+        ; thresholding
+        movdqa      xmm6, xmm1                  ; xmm6 = r0
+        psubusw     xmm6, xmm4                  ; xmm6 = saturating (r0 - r-1)
+        psubusw     xmm4, xmm1                  ; xmm4 = saturating (r-1 - r0)
+        paddusw     xmm6, xmm4                  ; xmm6 = abs(r0 - r-1)
+        pcmpgtw     xmm6, xmm2                  ; all-ones words where abs(r0 - r-1) > flimit
+        por         xmm7, xmm6                  ; accumulate thresholds
+
+
+        paddusw     xmm3, RD42                  ; xmm3 += rounding constant (rd42 is defined outside this view)
+        psraw       xmm3, 3                     ; xmm3 = (4*r0 + r-2 + r-1 + r1 + r2 + round) >> 3
+
+        pand        xmm1, xmm7                  ; keep the source value where any difference exceeded flimit
+        pandn       xmm7, xmm3                  ; keep the filtered value where no difference exceeded flimit
+        paddusw     xmm1, xmm7                  ; combine the two disjoint selections
+
+        packuswb    xmm1, xmm0                  ; pack the 8 words back to bytes
+        movq        QWORD PTR [rdi], xmm1       ; store 8 vertically filtered pixels to dst
+
+        neg         rax                         ; rax = +pitch again
+        add         rsi, 8
+        add         rdi, 8
+
+        add         rdx, 8
+        cmp         edx, dword arg(5) ;cols
+
+        jl          .nextcol                    ; signed compare: loop while column counter < cols
+
+        ; done with all the cols, start the across filtering in place
+        sub         rsi, rdx                    ; rewind src to the start of the row
+        sub         rdi, rdx                    ; rewind dst to the start of the row
+
+        xor         rdx, rdx                    ; restart the column counter
+        movq        mm0, QWORD PTR [rdi-8];     ; mm0 = the 8 bytes before the row start; the first loop pass writes them back unchanged
+
+.acrossnextcol:
+        movq        xmm7, QWORD PTR [rdi +rdx -2] ; xmm7 = p-2..p5 (so 2 bytes before the row start are read when rdx = 0)
+        movd        xmm4, DWORD PTR [rdi +rdx +6] ; xmm4 = p6..p9 (so up to 2 bytes past the current group are read)
+
+        pslldq      xmm4, 8                     ; move p6..p9 up to bytes 8..11
+        por         xmm4, xmm7                  ; xmm4 = p-2..p9 (12 bytes)
+
+        movdqa      xmm3, xmm4
+        psrldq      xmm3, 2                     ; drop p-2, p-1
+        punpcklbw   xmm3, xmm0                  ; xmm3 = p0..p7 as words
+        movdqa      xmm1, xmm3                  ; xmm1 = p0..p7, kept as the unfiltered value
+        psllw       xmm3, 2                     ; xmm3 = 4 * p0..p7
+
+
+        movdqa      xmm5, xmm4
+        psrldq      xmm5, 3
+        punpcklbw   xmm5, xmm0                  ; xmm5 = p1..p8 as words
+        paddusw     xmm3, xmm5                  ; xmm3 += p1..p8
+
+        ; thresholding
+        movdqa      xmm7, xmm1                  ; xmm7 = p0..p7
+        psubusw     xmm7, xmm5                  ; xmm7 = saturating (p0..p7 - p1..p8)
+        psubusw     xmm5, xmm1                  ; xmm5 = saturating (p1..p8 - p0..p7)
+        paddusw     xmm7, xmm5                  ; xmm7 = abs(p0..p7 - p1..p8)
+        pcmpgtw     xmm7, xmm2                  ; all-ones words where the difference > flimit
+
+        movdqa      xmm5, xmm4
+        psrldq      xmm5, 4
+        punpcklbw   xmm5, xmm0                  ; xmm5 = p2..p9 as words
+        paddusw     xmm3, xmm5                  ; xmm3 += p2..p9
+
+        ; thresholding
+        movdqa      xmm6, xmm1                  ; xmm6 = p0..p7
+        psubusw     xmm6, xmm5                  ; xmm6 = saturating (p0..p7 - p2..p9)
+        psubusw     xmm5, xmm1                  ; xmm5 = saturating (p2..p9 - p0..p7)
+        paddusw     xmm6, xmm5                  ; xmm6 = abs(p0..p7 - p2..p9)
+        pcmpgtw     xmm6, xmm2                  ; all-ones words where the difference > flimit
+        por         xmm7, xmm6                  ; accumulate thresholds
+
+
+        movdqa      xmm5, xmm4                  ; xmm5 = p-2..p9 (bytes)
+        punpcklbw   xmm5, xmm0                  ; xmm5 = p-2..p5 as words
+        paddusw     xmm3, xmm5                  ; xmm3 += p-2..p5
+
+        ; thresholding
+        movdqa      xmm6, xmm1                  ; xmm6 = p0..p7
+        psubusw     xmm6, xmm5                  ; xmm6 = saturating (p0..p7 - p-2..p5)
+        psubusw     xmm5, xmm1                  ; xmm5 = saturating (p-2..p5 - p0..p7)
+        paddusw     xmm6, xmm5                  ; xmm6 = abs(p0..p7 - p-2..p5)
+        pcmpgtw     xmm6, xmm2                  ; all-ones words where the difference > flimit
+        por         xmm7, xmm6                  ; accumulate thresholds
+
+        psrldq      xmm4, 1                     ; xmm4 = p-1..p9 (bytes)
+        punpcklbw   xmm4, xmm0                  ; xmm4 = p-1..p6 as words
+        paddusw     xmm3, xmm4                  ; xmm3 += p-1..p6
+
+        ; thresholding
+        movdqa      xmm6, xmm1                  ; xmm6 = p0..p7
+        psubusw     xmm6, xmm4                  ; xmm6 = saturating (p0..p7 - p-1..p6)
+        psubusw     xmm4, xmm1                  ; xmm4 = saturating (p-1..p6 - p0..p7)
+        paddusw     xmm6, xmm4                  ; xmm6 = abs(p0..p7 - p-1..p6)
+        pcmpgtw     xmm6, xmm2                  ; all-ones words where the difference > flimit
+        por         xmm7, xmm6                  ; accumulate thresholds
+
+        paddusw     xmm3, RD42                  ; xmm3 += rounding constant (rd42 is defined outside this view)
+        psraw       xmm3, 3                     ; xmm3 = (4*p0 + p-2 + p-1 + p1 + p2 + round) >> 3
+
+        pand        xmm1, xmm7                  ; keep the unfiltered value where any difference exceeded flimit
+        pandn       xmm7, xmm3                  ; keep the filtered value where no difference exceeded flimit
+        paddusw     xmm1, xmm7                  ; combine the two disjoint selections
+
+        packuswb    xmm1, xmm0                  ; pack the 8 words back to bytes
+        movq        QWORD PTR [rdi+rdx-8], mm0  ; store the previous group's 8 bytes; delayed so the loads above still saw unfiltered p-2, p-1
+        movdq2q     mm0, xmm1                   ; hold this group's result until the next pass
+
+        add         rdx, 8
+        cmp         edx, dword arg(5) ;cols
+        jl          .acrossnextcol;
+
+        ; last 8 pixels
+        movq        QWORD PTR [rdi+rdx-8], mm0  ; flush the result still held in mm0
+
+        ; done with this row
+        add         rsi,rax                     ; next source line (rax = source pitch here)
+        mov         eax, dword arg(3) ;dst_pixels_per_line (destination pitch). NOTE(review): 32-bit load, unlike the initial movsxd; a negative pitch would not be sign-extended on 64-bit - confirm callers never pass one
+        add         rdi,rax                     ; next destination line
+        mov         eax, dword arg(2) ;src_pixels_per_line (source pitch reloaded for the next row's vertical pass)
+
+        dec         rcx                         ; decrement row count
+        jnz         .nextrow                    ; next row
+
+%if ABI_IS_32BIT=1 && CONFIG_PIC=1
+    add rsp,16
+    pop rsp
+%endif
+    ; begin epilog
+    pop         rdi
+    pop         rsi
+    RESTORE_GOT
+    RESTORE_XMM
+    UNSHADOW_ARGS
+    pop         rbp
+    ret                                         ; NOTE(review): mm0 was used above and no emms is issued in this function - presumably the caller clears MMX state; confirm
+%undef RD42
+
+
+;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
+; int pitch, int rows, int cols,int flimit)
+extern sym(vp9_rv)
+global sym(vp9_mbpost_proc_down_xmm) PRIVATE
+sym(vp9_mbpost_proc_down_xmm):
+    push        rbp
+    mov         rbp, rsp
+    SHADOW_ARGS_TO_STACK 5
+    SAVE_XMM 7
+    GET_GOT     rbx
+    push        rsi
+    push        rdi
+    ; end prolog
+
+    ALIGN_STACK 16, rax
+    sub         rsp, 128+16
+
+    ; unsigned char d[16][8] at [rsp]
+    ; create flimit2 at [rsp+128]
+    mov         eax, dword ptr arg(4) ;flimit
+    mov         [rsp+128], eax
+    mov         [rsp+128+4], eax
+    mov         [rsp+128+8], eax
+    mov         [rsp+128+12], eax
+%define flimit4 [rsp+128]
+
+%if ABI_IS_32BIT=0
+    lea         r8, [GLOBAL(sym(vp9_rv))]
+%endif
+
+    ;rows +=8;
+    add         dword arg(2), 8
+
+    ;for(c=0; c