;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Syntax: Intel operand order (NASM/YASM style). Argument access, symbol
; naming, GOT handling and section selection all go through the macros
; provided by the include below.

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
;void vp8_subtract_b_sse2_impl(unsigned char *z,  int src_stride,
;                              short *diff, unsigned char *Predictor,
;                              int pitch);
;
; Writes diff = z - Predictor for a block of 4 rows x 4 pixels.
;   source row k     is read from    z         + k * src_stride  (bytes)
;   predictor row k  is read from    Predictor + k * pitch       (bytes)
;   result row k     is written to   diff      + k * pitch       (shorts)
; Each row is 4 bytes in, 4 sixteen-bit differences (8 bytes) out.
;
; Register roles:
;   rsi = z, rax = Predictor, rdi = diff, rdx = src_stride, rcx = pitch
;   mm7 = zero (used to widen bytes to words), mm0/mm1 = scratch
;
; Note: despite the name, the arithmetic is done in the MMX registers
; mm0, mm1 and mm7, and no emms is executed before returning.
; NOTE(review): callers presumably have to clear the MMX state
; themselves before using x87 floating point -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_subtract_b_sse2_impl) PRIVATE
sym(vp8_subtract_b_sse2_impl):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdi,        arg(2)              ;diff
    mov         rax,        arg(3)              ;Predictor
    mov         rsi,        arg(0)              ;z
    movsxd      rdx,        dword ptr arg(1)    ;src_stride
    movsxd      rcx,        dword ptr arg(4)    ;pitch
    pxor        mm7,        mm7                 ; mm7 = 0

    ; row 0
    movd        mm0,        [rsi]
    movd        mm1,        [rax]
    punpcklbw   mm0,        mm7                 ; bytes -> words (zero-extend)
    punpcklbw   mm1,        mm7
    psubw       mm0,        mm1                 ; src - pred, 16 bit
    movq        MMWORD PTR [rdi],       mm0

    ; row 1
    movd        mm0,        [rsi+rdx]
    movd        mm1,        [rax+rcx]
    punpcklbw   mm0,        mm7
    punpcklbw   mm1,        mm7
    psubw       mm0,        mm1
    movq        MMWORD PTR [rdi+rcx*2], mm0     ; diff + 1*pitch shorts

    ; row 2
    movd        mm0,        [rsi+rdx*2]
    movd        mm1,        [rax+rcx*2]
    punpcklbw   mm0,        mm7
    punpcklbw   mm1,        mm7
    psubw       mm0,        mm1
    movq        MMWORD PTR [rdi+rcx*4], mm0     ; diff + 2*pitch shorts

    ; row 3: there is no *3 scale factor, so advance the source pointer
    ; by two rows and turn rcx into 3*pitch.
    lea         rsi,        [rsi+rdx*2]         ; rsi = z + 2*src_stride
    lea         rcx,        [rcx+rcx*2]         ; rcx = 3*pitch

    movd        mm0,        [rsi+rdx]           ; z + 3*src_stride
    movd        mm1,        [rax+rcx]           ; Predictor + 3*pitch
    punpcklbw   mm0,        mm7
    punpcklbw   mm1,        mm7
    psubw       mm0,        mm1
    movq        MMWORD PTR [rdi+rcx*2], mm0     ; diff + 3*pitch shorts

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride,
;                           unsigned char *pred, int pred_stride)
;
; Writes diff = src - pred for 16 rows x 16 pixels. The loop runs 8 times
; and handles two rows per iteration; the output is packed, 16 shorts
; (32 bytes) per row, 64 bytes per iteration.
;
; All 16-byte accesses use movdqa, so every src row, every pred row and
; diff must be 16-byte aligned.
;
; How the 16-bit result is built from byte operations:
;   psubb   yields the low 8 bits of (src - pred).
;   Both operands are XORed with 0x80 (t80) so that the signed compare
;   pcmpgtb orders them as unsigned values; the compare result is 0xFF
;   in every byte where pred > src, i.e. where the difference is
;   negative, and 0x00 otherwise.
;   punpcklbw/punpckhbw interleave the low byte with that mask, which
;   gives the sign-extended 16-bit difference.
;
; Register roles:
;   rdi = diff, rsi = src, rdx = src_stride, rax = pred,
;   rbx = pred_stride, rcx = loop counter, xmm4 = t80 constant
;
; rbx is saved with an explicit push because it is reused for
; pred_stride; t80 is loaded through GLOBAL() before that happens.
;-----------------------------------------------------------------------
global sym(vp8_subtract_mby_sse2) PRIVATE
sym(vp8_subtract_mby_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rdi,        arg(0)              ;diff
    mov         rsi,        arg(1)              ;src
    movsxd      rdx,        dword ptr arg(2)    ;src_stride
    mov         rax,        arg(3)              ;pred
    movdqa      xmm4,       [GLOBAL(t80)]
    push        rbx
    mov         rcx,        8                   ; do two lines at one time
    movsxd      rbx,        dword ptr arg(4)    ;pred_stride

.submby_loop:
    ; first row of the pair
    movdqa      xmm0,       [rsi]               ; src
    movdqa      xmm1,       [rax]               ; pred

    movdqa      xmm2,       xmm0
    psubb       xmm0,       xmm1                ; low byte of src - pred

    pxor        xmm1,       xmm4                ;convert to signed values
    pxor        xmm2,       xmm4
    pcmpgtb     xmm1,       xmm2                ; obtain sign information

    movdqa      xmm2,       xmm0
    punpcklbw   xmm0,       xmm1                ; put sign back to subtraction
    punpckhbw   xmm2,       xmm1                ; put sign back to subtraction

    ; second row of the pair is loaded before the pointers advance
    movdqa      xmm3,       [rsi + rdx]
    movdqa      xmm5,       [rax + rbx]

    lea         rsi,        [rsi+rdx*2]
    lea         rax,        [rax+rbx*2]

    movdqa      [rdi],      xmm0
    movdqa      [rdi +16],  xmm2

    movdqa      xmm1,       xmm3
    psubb       xmm3,       xmm5                ; low byte of src - pred

    pxor        xmm5,       xmm4                ;convert to signed values
    pxor        xmm1,       xmm4
    pcmpgtb     xmm5,       xmm1                ; obtain sign information

    movdqa      xmm1,       xmm3
    punpcklbw   xmm3,       xmm5                ; put sign back to subtraction
    punpckhbw   xmm1,       xmm5                ; put sign back to subtraction

    movdqa      [rdi +32],  xmm3
    movdqa      [rdi +48],  xmm1

    add         rdi,        64
    dec         rcx
    jnz         .submby_loop

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc,
;                            unsigned char *vsrc, int src_stride,
;                            unsigned char *upred, unsigned char *vpred,
;                            int pred_stride)
;
; Writes diff = src - pred for two planes of 8 rows x 8 pixels each,
; first usrc/upred and then vsrc/vpred. Output starts 256 shorts past
; the diff pointer that was passed in and is packed: 8 shorts per row,
; the second plane directly after the first.
;
; Each loop runs 4 times and handles two rows per iteration: the two
; 8-byte rows are loaded with movq and merged into one 16-byte register
; with punpcklqdq. The stores use movdqa, so diff must be 16-byte
; aligned. The 16-bit difference is rebuilt from byte operations: psubb
; gives the low byte, and a pcmpgtb on operands XORed with 0x80 (t80)
; gives 0xFF where pred > src, which becomes the high byte.
;
; Register roles:
;   rdi = output pointer, rsi = current src plane, rax = current pred
;   plane, rdx = src_stride, rbx = pred_stride, rcx = loop counter,
;   xmm4 = t80 constant
;
; rbx is saved with an explicit push because it is reused for
; pred_stride; t80 is loaded through GLOBAL() before that happens.
;-----------------------------------------------------------------------
global sym(vp8_subtract_mbuv_sse2) PRIVATE
sym(vp8_subtract_mbuv_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movdqa      xmm4,       [GLOBAL(t80)]
    mov         rdi,        arg(0)              ;diff
    mov         rsi,        arg(1)              ;usrc
    movsxd      rdx,        dword ptr arg(3)    ;src_stride
    mov         rax,        arg(4)              ;upred
    add         rdi,        256*2               ;diff = diff + 256 (shorts)
    mov         rcx,        4
    push        rbx
    movsxd      rbx,        dword ptr arg(6)    ;pred_stride

    ;u
.submbu_loop:
    movq        xmm0,       [rsi]               ; src
    movq        xmm2,       [rsi+rdx]           ; src -- next line
    movq        xmm1,       [rax]               ; pred
    movq        xmm3,       [rax+rbx]           ; pred -- next line
    lea         rsi,        [rsi + rdx*2]
    lea         rax,        [rax + rbx*2]

    punpcklqdq  xmm0,       xmm2                ; two src rows in one register
    punpcklqdq  xmm1,       xmm3                ; two pred rows in one register

    movdqa      xmm2,       xmm0
    psubb       xmm0,       xmm1                ; subtraction with sign missed

    pxor        xmm1,       xmm4                ;convert to signed values
    pxor        xmm2,       xmm4
    pcmpgtb     xmm1,       xmm2                ; obtain sign information

    movdqa      xmm2,       xmm0
    movdqa      xmm3,       xmm1
    punpcklbw   xmm0,       xmm1                ; put sign back to subtraction
    punpckhbw   xmm2,       xmm3                ; put sign back to subtraction

    movdqa      [rdi],      xmm0                ; store difference
    movdqa      [rdi +16],  xmm2                ; store difference
    add         rdi,        32
    sub         rcx,        1
    jnz         .submbu_loop

    ; rdi keeps advancing, so the second plane lands right after the first
    mov         rsi,        arg(2)              ;vsrc
    mov         rax,        arg(5)              ;vpred
    mov         rcx,        4

    ;v
.submbv_loop:
    movq        xmm0,       [rsi]               ; src
    movq        xmm2,       [rsi+rdx]           ; src -- next line
    movq        xmm1,       [rax]               ; pred
    movq        xmm3,       [rax+rbx]           ; pred -- next line
    lea         rsi,        [rsi + rdx*2]
    lea         rax,        [rax + rbx*2]

    punpcklqdq  xmm0,       xmm2                ; two src rows in one register
    punpcklqdq  xmm1,       xmm3                ; two pred rows in one register

    movdqa      xmm2,       xmm0
    psubb       xmm0,       xmm1                ; subtraction with sign missed

    pxor        xmm1,       xmm4                ;convert to signed values
    pxor        xmm2,       xmm4
    pcmpgtb     xmm1,       xmm2                ; obtain sign information

    movdqa      xmm2,       xmm0
    movdqa      xmm3,       xmm1
    punpcklbw   xmm0,       xmm1                ; put sign back to subtraction
    punpckhbw   xmm2,       xmm3                ; put sign back to subtraction

    movdqa      [rdi],      xmm0                ; store difference
    movdqa      [rdi +16],  xmm2                ; store difference
    add         rdi,        32
    sub         rcx,        1
    jnz         .submbv_loop

    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
; 0x80 in every byte: XORing with it flips the top bit of each byte, so a
; signed byte compare afterwards orders the original values as unsigned.
t80:
    times 16 db 0x80