;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Syntax: Intel operand order (op dst, src), NASM/Yasm-style preprocessor.
;
; sym(), arg(), SHADOW_ARGS_TO_STACK, UNSHADOW_ARGS and PRIVATE are not
; defined in this file; presumably they are supplied by the include below.
; NOTE(review): none of these routines executes EMMS after using the MMX
; registers -- confirm that the callers restore the x87 state themselves.

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
;                              short *diff, unsigned char *Predictor,
;                              int pitch);
;
; Computes diff = z - Predictor for a block of 4 rows x 4 pixels.
; Each row loads 4 bytes from the source and 4 bytes from the predictor,
; zero-extends them to 16-bit words and stores 4 word differences.
;
; Row r (r = 0..3) uses:
;   source    : z         + r * src_stride   (bytes)
;   predictor : Predictor + r * pitch        (bytes)
;   output    : diff      + r * pitch        (shorts, i.e. r*pitch*2 bytes)
;
; Registers:
;   rsi = source row pointer      rdx = src_stride (sign-extended)
;   rax = predictor pointer       rcx = pitch, later 3*pitch
;   rdi = diff pointer            mm7 = zero (for byte->word unpacking)
;   mm0, mm1 = scratch
;-----------------------------------------------------------------------
global sym(vp8_subtract_b_mmx_impl) PRIVATE
sym(vp8_subtract_b_mmx_impl):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rdi,        arg(2)              ; diff
    mov         rax,        arg(3)              ; Predictor
    mov         rsi,        arg(0)              ; z
    movsxd      rdx,        dword ptr arg(1)    ; src_stride
    movsxd      rcx,        dword ptr arg(4)    ; pitch
    pxor        mm7,        mm7                 ; mm7 = 0

    ; row 0
    movd        mm0,        [rsi]
    movd        mm1,        [rax]
    punpcklbw   mm0,        mm7                 ; 4 source bytes -> 4 words
    punpcklbw   mm1,        mm7                 ; 4 predictor bytes -> 4 words
    psubw       mm0,        mm1
    movq        [rdi],      mm0

    ; row 1
    movd        mm0,        [rsi+rdx]
    movd        mm1,        [rax+rcx]
    punpcklbw   mm0,        mm7
    punpcklbw   mm1,        mm7
    psubw       mm0,        mm1
    movq        [rdi+rcx*2], mm0                ; diff + pitch shorts

    ; row 2
    movd        mm0,        [rsi+rdx*2]
    movd        mm1,        [rax+rcx*2]
    punpcklbw   mm0,        mm7
    punpcklbw   mm1,        mm7
    psubw       mm0,        mm1
    movq        [rdi+rcx*4], mm0                ; diff + 2*pitch shorts

    ; A scale factor of 3 (or 6) cannot be encoded in an address, so
    ; advance the source by two rows and pre-multiply the pitch by three.
    lea         rsi,        [rsi+rdx*2]         ; rsi = z + 2*src_stride
    lea         rcx,        [rcx+rcx*2]         ; rcx = 3*pitch

    ; row 3
    movd        mm0,        [rsi+rdx]           ; z + 3*src_stride
    movd        mm1,        [rax+rcx]           ; Predictor + 3*pitch
    punpcklbw   mm0,        mm7
    punpcklbw   mm1,        mm7
    psubw       mm0,        mm1
    movq        [rdi+rcx*2], mm0                ; diff + 3*pitch shorts

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride,
;                           unsigned char *pred, int pred_stride)
;
; Computes diff = src - pred for 16 rows of 16 pixels.
; The output is written contiguously: 16 shorts (32 bytes) per row, with
; no padding between rows. src advances by src_stride and pred by
; pred_stride after every row.
;
; Registers:
;   rdi = diff write pointer      rcx = rows remaining
;   rsi = source row pointer      rdx = src_stride (sign-extended)
;   rax = predictor row pointer   rbx = pred_stride (sign-extended)
;   mm0 = zero                    mm1..mm4 = scratch
;-----------------------------------------------------------------------
global sym(vp8_subtract_mby_mmx) PRIVATE
sym(vp8_subtract_mby_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rdi,        arg(0)              ; diff
    mov         rsi,        arg(1)              ; src
    movsxd      rdx,        dword ptr arg(2)    ; src_stride
    mov         rax,        arg(3)              ; pred
    push        rbx                             ; saved here, restored before the epilog
    movsxd      rbx,        dword ptr arg(4)    ; pred_stride

    pxor        mm0,        mm0                 ; mm0 = 0
    mov         rcx,        16                  ; row counter

.submby_loop:
    ; pixels 0..7 of the current row
    movq        mm1,        [rsi]
    movq        mm3,        [rax]

    movq        mm2,        mm1
    movq        mm4,        mm3

    punpcklbw   mm1,        mm0                 ; source bytes 0..3 -> words
    punpcklbw   mm3,        mm0                 ; predictor bytes 0..3 -> words

    punpckhbw   mm2,        mm0                 ; source bytes 4..7 -> words
    punpckhbw   mm4,        mm0                 ; predictor bytes 4..7 -> words

    psubw       mm1,        mm3
    psubw       mm2,        mm4

    movq        [rdi],      mm1
    movq        [rdi+8],    mm2

    ; pixels 8..15 of the current row
    movq        mm1,        [rsi+8]
    movq        mm3,        [rax+8]

    movq        mm2,        mm1
    movq        mm4,        mm3

    punpcklbw   mm1,        mm0
    punpcklbw   mm3,        mm0

    punpckhbw   mm2,        mm0
    punpckhbw   mm4,        mm0

    psubw       mm1,        mm3
    psubw       mm2,        mm4

    movq        [rdi+16],   mm1
    movq        [rdi+24],   mm2

    add         rdi,        32                  ; 16 shorts written per row
    lea         rax,        [rax+rbx]           ; next predictor row
    lea         rsi,        [rsi+rdx]           ; next source row
    dec         rcx
    jnz         .submby_loop

    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
; vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc,
;                       int src_stride, unsigned char *upred,
;                       unsigned char *vpred, int pred_stride)
;
; Computes the differences for two planes of 8 rows x 8 pixels each:
;   diff[256 .. 319] = usrc - upred
;   diff[320 .. 383] = vsrc - vpred
; Writing starts 256 shorts into diff and is contiguous, 8 shorts
; (16 bytes) per row. Both source planes share src_stride and both
; predictor planes share pred_stride.
;
; Registers:
;   rdi = diff write pointer      rcx = rows remaining
;   rsi = source row pointer      rdx = src_stride (sign-extended)
;   rax = predictor row pointer   rbx = pred_stride (sign-extended)
;   mm7 = zero                    mm0, mm1, mm3, mm4 = scratch
;-----------------------------------------------------------------------
global sym(vp8_subtract_mbuv_mmx) PRIVATE
sym(vp8_subtract_mbuv_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    ; end prolog

    mov         rdi,        arg(0)              ; diff
    mov         rsi,        arg(1)              ; usrc
    movsxd      rdx,        dword ptr arg(3)    ; src_stride
    mov         rax,        arg(4)              ; upred
    add         rdi,        256*2               ; diff = diff + 256 (shorts)
    mov         rcx,        8                   ; row counter for the U plane
    push        rbx                             ; saved here, restored before the epilog
    movsxd      rbx,        dword ptr arg(6)    ; pred_stride

    pxor        mm7,        mm7                 ; mm7 = 0

.submbu_loop:
    movq        mm0,        [rsi]
    movq        mm1,        [rax]
    movq        mm3,        mm0
    movq        mm4,        mm1
    punpcklbw   mm0,        mm7                 ; source bytes 0..3 -> words
    punpcklbw   mm1,        mm7                 ; predictor bytes 0..3 -> words
    punpckhbw   mm3,        mm7                 ; source bytes 4..7 -> words
    punpckhbw   mm4,        mm7                 ; predictor bytes 4..7 -> words
    psubw       mm0,        mm1
    psubw       mm3,        mm4
    movq        [rdi],      mm0
    movq        [rdi+8],    mm3
    add         rdi,        16                  ; 8 shorts written per row
    add         rsi,        rdx                 ; next source row
    add         rax,        rbx                 ; next predictor row

    dec         rcx
    jnz         .submbu_loop

    ; V plane: rdi keeps advancing, so the output follows the U block.
    mov         rsi,        arg(2)              ; vsrc
    mov         rax,        arg(5)              ; vpred
    mov         rcx,        8                   ; row counter for the V plane

.submbv_loop:
    movq        mm0,        [rsi]
    movq        mm1,        [rax]
    movq        mm3,        mm0
    movq        mm4,        mm1
    punpcklbw   mm0,        mm7
    punpcklbw   mm1,        mm7
    punpckhbw   mm3,        mm7
    punpckhbw   mm4,        mm7
    psubw       mm0,        mm1
    psubw       mm3,        mm4
    movq        [rdi],      mm0
    movq        [rdi+8],    mm3
    add         rdi,        16
    add         rsi,        rdx
    add         rax,        rbx

    dec         rcx
    jnz         .submbv_loop

    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret