diff -r 000000000000 -r 6474c204b198 media/libvpx/vp9/common/x86/vp9_postproc_mmx.asm --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/media/libvpx/vp9/common/x86/vp9_postproc_mmx.asm Wed Dec 31 06:09:35 2014 +0100 @@ -0,0 +1,534 @@ +; +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. +; +; Use of this source code is governed by a BSD-style license +; that can be found in the LICENSE file in the root of the source +; tree. An additional intellectual property rights grant can be found +; in the file PATENTS. All contributing project authors may +; be found in the AUTHORS file in the root of the source tree. +; + + +%include "vpx_ports/x86_abi_support.asm" + +%define VP9_FILTER_WEIGHT 128 +%define VP9_FILTER_SHIFT 7 + +;void vp9_post_proc_down_and_across_mmx +;( +; unsigned char *src_ptr, +; unsigned char *dst_ptr, +; int src_pixels_per_line, +; int dst_pixels_per_line, +; int rows, +; int cols, +; int flimit +;) +global sym(vp9_post_proc_down_and_across_mmx) PRIVATE +sym(vp9_post_proc_down_and_across_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 7 + GET_GOT rbx + push rsi + push rdi + ; end prolog + +%if ABI_IS_32BIT=1 && CONFIG_PIC=1 + ; move the global rd onto the stack, since we don't have enough registers + ; to do PIC addressing + movq mm0, [GLOBAL(rd)] + sub rsp, 8 + movq [rsp], mm0 +%define RD [rsp] +%else +%define RD [GLOBAL(rd)] +%endif + + push rbx + lea rbx, [GLOBAL(Blur)] + movd mm2, dword ptr arg(6) ;flimit + punpcklwd mm2, mm2 + punpckldq mm2, mm2 + + mov rsi, arg(0) ;src_ptr + mov rdi, arg(1) ;dst_ptr + + movsxd rcx, DWORD PTR arg(4) ;rows + movsxd rax, DWORD PTR arg(2) ;src_pixels_per_line ; destination pitch? + pxor mm0, mm0 ; mm0 = 00000000 + +.nextrow: + + xor rdx, rdx ; clear out rdx for use as loop counter +.nextcol: + + pxor mm7, mm7 ; mm7 = 00000000 + movq mm6, [rbx + 32 ] ; mm6 = kernel 2 taps + movq mm3, [rsi] ; mm4 = r0 p0..p7 + punpcklbw mm3, mm0 ; mm3 = p0..p3 + movq mm1, mm3 ; mm1 = p0..p3 + pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers + + movq mm6, [rbx + 48] ; mm6 = kernel 3 taps + movq mm5, [rsi + rax] ; mm4 = r1 p0..p7 + punpcklbw mm5, mm0 ; mm5 = r1 p0..p3 + pmullw mm6, mm5 ; mm6 *= p0..p3 * kernel 3 modifiers + paddusw mm3, mm6 ; mm3 += mm6 + + ; thresholding + movq mm7, mm1 ; mm7 = r0 p0..p3 + psubusw mm7, mm5 ; mm7 = r0 p0..p3 - r1 p0..p3 + psubusw mm5, mm1 ; mm5 = r1 p0..p3 - r0 p0..p3 + paddusw mm7, mm5 ; mm7 = abs(r0 p0..p3 - r1 p0..p3) + pcmpgtw mm7, mm2 + + movq mm6, [rbx + 64 ] ; mm6 = kernel 4 modifiers + movq mm5, [rsi + 2*rax] ; mm4 = r2 p0..p7 + punpcklbw mm5, mm0 ; mm5 = r2 p0..p3 + pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = r0 p0..p3 + psubusw mm6, mm5 ; mm6 = r0 p0..p3 - r2 p0..p3 + psubusw mm5, mm1 ; mm5 = r2 p0..p3 - r2 p0..p3 + paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r2 p0..p3) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + + neg rax + movq mm6, [rbx ] ; kernel 0 taps + movq mm5, [rsi+2*rax] ; mm4 = r-2 p0..p7 + punpcklbw mm5, mm0 ; mm5 = r-2 p0..p3 + pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = r0 p0..p3 + psubusw mm6, mm5 ; mm6 = p0..p3 - r-2 p0..p3 + psubusw mm5, mm1 ; mm5 = r-2 p0..p3 - p0..p3 + paddusw mm6, mm5 ; mm6 = abs(r0 p0..p3 - r-2 p0..p3) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + movq mm6, [rbx + 16] ; kernel 1 taps + movq mm4, [rsi+rax] ; mm4 = r-1 p0..p7 + punpcklbw mm4, mm0 ; mm4 = r-1 p0..p3 + pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers. + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = r0 p0..p3 + psubusw mm6, mm4 ; mm6 = p0..p3 - r-2 p0..p3 + psubusw mm4, mm1 ; mm5 = r-1 p0..p3 - p0..p3 + paddusw mm6, mm4 ; mm6 = abs(r0 p0..p3 - r-1 p0..p3) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + + paddusw mm3, RD ; mm3 += round value + psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 + + pand mm1, mm7 ; mm1 select vals > thresh from source + pandn mm7, mm3 ; mm7 select vals < thresh from blurred result + paddusw mm1, mm7 ; combination + + packuswb mm1, mm0 ; pack to bytes + + movd [rdi], mm1 ; + neg rax ; pitch is positive + + + add rsi, 4 + add rdi, 4 + add rdx, 4 + + cmp edx, dword ptr arg(5) ;cols + jl .nextcol + ; done with the all cols, start the across filtering in place + sub rsi, rdx + sub rdi, rdx + + + push rax + xor rdx, rdx + mov rax, [rdi-4]; + +.acrossnextcol: + pxor mm7, mm7 ; mm7 = 00000000 + movq mm6, [rbx + 32 ] ; + movq mm4, [rdi+rdx] ; mm4 = p0..p7 + movq mm3, mm4 ; mm3 = p0..p7 + punpcklbw mm3, mm0 ; mm3 = p0..p3 + movq mm1, mm3 ; mm1 = p0..p3 + pmullw mm3, mm6 ; mm3 *= kernel 2 modifiers + + movq mm6, [rbx + 48] + psrlq mm4, 8 ; mm4 = p1..p7 + movq mm5, mm4 ; mm5 = p1..p7 + punpcklbw mm5, mm0 ; mm5 = p1..p4 + pmullw mm6, mm5 ; mm6 *= p1..p4 * kernel 3 modifiers + paddusw mm3, mm6 ; mm3 += mm6 + + ; thresholding + movq mm7, mm1 ; mm7 = p0..p3 + psubusw mm7, mm5 ; mm7 = p0..p3 - p1..p4 + psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 + paddusw mm7, mm5 ; mm7 = abs(p0..p3 - p1..p4) + pcmpgtw mm7, mm2 + + movq mm6, [rbx + 64 ] + psrlq mm4, 8 ; mm4 = p2..p7 + movq mm5, mm4 ; mm5 = p2..p7 + punpcklbw mm5, mm0 ; mm5 = p2..p5 + pmullw mm6, mm5 ; mm5 *= kernel 4 modifiers + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = p0..p3 + psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4 + psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 + paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + + movq mm6, [rbx ] + movq mm4, [rdi+rdx-2] ; mm4 = p-2..p5 + movq mm5, mm4 ; mm5 = p-2..p5 + punpcklbw mm5, mm0 ; mm5 = p-2..p1 + pmullw mm6, mm5 ; mm5 *= kernel 0 modifiers + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = p0..p3 + psubusw mm6, mm5 ; mm6 = p0..p3 - p1..p4 + psubusw mm5, mm1 ; mm5 = p1..p4 - p0..p3 + paddusw mm6, mm5 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + movq mm6, [rbx + 16] + psrlq mm4, 8 ; mm4 = p-1..p5 + punpcklbw mm4, mm0 ; mm4 = p-1..p2 + pmullw mm6, mm4 ; mm4 *= kernel 1 modifiers. + paddusw mm3, mm6 ; mm3 += mm5 + + ; thresholding + movq mm6, mm1 ; mm6 = p0..p3 + psubusw mm6, mm4 ; mm6 = p0..p3 - p1..p4 + psubusw mm4, mm1 ; mm5 = p1..p4 - p0..p3 + paddusw mm6, mm4 ; mm6 = abs(p0..p3 - p1..p4) + pcmpgtw mm6, mm2 + por mm7, mm6 ; accumulate thresholds + + paddusw mm3, RD ; mm3 += round value + psraw mm3, VP9_FILTER_SHIFT ; mm3 /= 128 + + pand mm1, mm7 ; mm1 select vals > thresh from source + pandn mm7, mm3 ; mm7 select vals < thresh from blurred result + paddusw mm1, mm7 ; combination + + packuswb mm1, mm0 ; pack to bytes + mov DWORD PTR [rdi+rdx-4], eax ; store previous four bytes + movd eax, mm1 + + add rdx, 4 + cmp edx, dword ptr arg(5) ;cols + jl .acrossnextcol; + + mov DWORD PTR [rdi+rdx-4], eax + pop rax + + ; done with this rwo + add rsi,rax ; next line + movsxd rax, dword ptr arg(3) ;dst_pixels_per_line ; destination pitch? + add rdi,rax ; next destination + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; destination pitch? + + dec rcx ; decrement count + jnz .nextrow ; next row + pop rbx + + ; begin epilog + pop rdi + pop rsi + RESTORE_GOT + UNSHADOW_ARGS + pop rbp + ret +%undef RD + + +;void vp9_mbpost_proc_down_mmx(unsigned char *dst, +; int pitch, int rows, int cols,int flimit) +extern sym(vp9_rv) +global sym(vp9_mbpost_proc_down_mmx) PRIVATE +sym(vp9_mbpost_proc_down_mmx): + push rbp + mov rbp, rsp + SHADOW_ARGS_TO_STACK 5 + GET_GOT rbx + push rsi + push rdi + ; end prolog + + ALIGN_STACK 16, rax + sub rsp, 136 + + ; unsigned char d[16][8] at [rsp] + ; create flimit2 at [rsp+128] + mov eax, dword ptr arg(4) ;flimit + mov [rsp+128], eax + mov [rsp+128+4], eax +%define flimit2 [rsp+128] + +%if ABI_IS_32BIT=0 + lea r8, [GLOBAL(sym(vp9_rv))] +%endif + + ;rows +=8; + add dword ptr arg(2), 8 + + ;for(c=0; c