michael@0: ;
michael@0: ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0: ;
michael@0: ;  Use of this source code is governed by a BSD-style license
michael@0: ;  that can be found in the LICENSE file in the root of the source
michael@0: ;  tree. An additional intellectual property rights grant can be found
michael@0: ;  in the file PATENTS.  All contributing project authors may
michael@0: ;  be found in the AUTHORS file in the root of the source tree.
michael@0: ;
michael@0: 
michael@0: 
michael@0: %include "vpx_ports/x86_abi_support.asm"
michael@0: 
michael@0: ;-----------------------------------------------------------------------------
michael@0: ; void vp8_subtract_b_mmx_impl(unsigned char *z, int src_stride,
michael@0: ;                              short *diff, unsigned char *Predictor,
michael@0: ;                              int pitch);
michael@0: ;
michael@0: ; Writes the 16-bit difference (z - Predictor) for a block of 4 rows by
michael@0: ; 4 pixels. Each row reads 4 bytes from z and 4 bytes from Predictor, widens
michael@0: ; them to words and stores 4 words (8 bytes) to diff.
michael@0: ;
michael@0: ; Row stepping:
michael@0: ;   z         advances by src_stride bytes
michael@0: ;   Predictor advances by pitch bytes
michael@0: ;   diff      advances by pitch*2 bytes (i.e. pitch 16-bit entries)
michael@0: ;
michael@0: ; Register roles after the argument load:
michael@0: ;   rsi = z, rdx = src_stride, rax = Predictor, rcx = pitch, rdi = diff,
michael@0: ;   mm7 = 0 (zero source for byte->word unpacking); mm0/mm1 are scratch.
michael@0: ;
michael@0: ; sym(), arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS are not defined in
michael@0: ; this file; presumably they come from vpx_ports/x86_abi_support.asm.
michael@0: ; NOTE(review): no emms is executed before returning; presumably the caller
michael@0: ; is responsible for clearing MMX state -- confirm.
michael@0: ;-----------------------------------------------------------------------------
michael@0: global sym(vp8_subtract_b_mmx_impl) PRIVATE
michael@0: sym(vp8_subtract_b_mmx_impl):
michael@0:     push        rbp
michael@0:     mov         rbp,        rsp
michael@0:     SHADOW_ARGS_TO_STACK 5
michael@0:     push        rsi
michael@0:     push        rdi
michael@0:     ; end prolog
michael@0: 
michael@0:     ; Fetch the arguments; the two int arguments are sign-extended.
michael@0:     mov         rsi,        arg(0)              ; rsi = z
michael@0:     movsxd      rdx,        dword ptr arg(1)    ; rdx = src_stride
michael@0:     mov         rdi,        arg(2)              ; rdi = diff
michael@0:     mov         rax,        arg(3)              ; rax = Predictor
michael@0:     movsxd      rcx,        dword ptr arg(4)    ; rcx = pitch
michael@0:     pxor        mm7,        mm7                 ; mm7 = 0
michael@0: 
michael@0:     ; ---- row 0 ----
michael@0:     movd        mm1,        [rax]               ; 4 predictor bytes
michael@0:     movd        mm0,        [rsi]               ; 4 source bytes
michael@0:     punpcklbw   mm1,        mm7                 ; widen predictor to words
michael@0:     punpcklbw   mm0,        mm7                 ; widen source to words
michael@0:     psubw       mm0,        mm1                 ; source - predictor
michael@0:     movq        [rdi],      mm0
michael@0: 
michael@0:     ; ---- row 1 ----
michael@0:     movd        mm1,        [rax+rcx]
michael@0:     movd        mm0,        [rsi+rdx]
michael@0:     punpcklbw   mm1,        mm7
michael@0:     punpcklbw   mm0,        mm7
michael@0:     psubw       mm0,        mm1
michael@0:     movq        [rdi+rcx*2], mm0                ; diff row = pitch*2 bytes
michael@0: 
michael@0:     ; ---- row 2 ----
michael@0:     movd        mm1,        [rax+rcx*2]
michael@0:     movd        mm0,        [rsi+rdx*2]
michael@0:     punpcklbw   mm1,        mm7
michael@0:     punpcklbw   mm0,        mm7
michael@0:     psubw       mm0,        mm1
michael@0:     movq        [rdi+rcx*4], mm0
michael@0: 
michael@0:     ; ---- row 3 ----
michael@0:     ; There is no *3 scale factor in the addressing modes, so move z down
michael@0:     ; two rows and triple the pitch. After that, [rsi+rdx] is z row 3,
michael@0:     ; [rax+rcx] is Predictor row 3 and [rdi+rcx*2] is diff row 3.
michael@0:     lea         rsi,        [rsi+rdx*2]
michael@0:     lea         rcx,        [rcx+rcx*2]
michael@0: 
michael@0:     movd        mm1,        [rax+rcx]
michael@0:     movd        mm0,        [rsi+rdx]
michael@0:     punpcklbw   mm1,        mm7
michael@0:     punpcklbw   mm0,        mm7
michael@0:     psubw       mm0,        mm1
michael@0:     movq        [rdi+rcx*2], mm0
michael@0: 
michael@0:     ; begin epilog
michael@0:     pop         rdi
michael@0:     pop         rsi
michael@0:     UNSHADOW_ARGS
michael@0:     pop         rbp
michael@0:     ret
michael@0: 
michael@0: ;-----------------------------------------------------------------------------
michael@0: ; void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride,
michael@0: ;                           unsigned char *pred, int pred_stride)
michael@0: ;
michael@0: ; Writes the 16-bit difference (src - pred) for 16 rows of 16 pixels.
michael@0: ; The output is packed: every row stores 16 words (32 bytes) and diff simply
michael@0: ; advances by 32 bytes per row, while src and pred advance by their strides.
michael@0: ;
michael@0: ; Register roles inside the loop:
michael@0: ;   rdi = diff, rsi = src, rdx = src_stride, rax = pred, rbx = pred_stride,
michael@0: ;   rcx = rows remaining, mm0 = 0; mm1-mm4 are scratch.
michael@0: ;
michael@0: ; sym(), arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS are not defined in
michael@0: ; this file; presumably they come from vpx_ports/x86_abi_support.asm.
michael@0: ; NOTE(review): no emms is executed before returning; presumably the caller
michael@0: ; is responsible for clearing MMX state -- confirm.
michael@0: ;-----------------------------------------------------------------------------
michael@0: global sym(vp8_subtract_mby_mmx) PRIVATE
michael@0: sym(vp8_subtract_mby_mmx):
michael@0:     push        rbp
michael@0:     mov         rbp,        rsp
michael@0:     SHADOW_ARGS_TO_STACK 5
michael@0:     push        rsi
michael@0:     push        rdi
michael@0:     push        rbx                             ; rbx holds pred_stride below
michael@0:     ; end prolog
michael@0: 
michael@0:     ; Fetch the arguments; the two int arguments are sign-extended.
michael@0:     mov         rdi,        arg(0)              ; rdi = diff
michael@0:     mov         rsi,        arg(1)              ; rsi = src
michael@0:     movsxd      rdx,        dword ptr arg(2)    ; rdx = src_stride
michael@0:     mov         rax,        arg(3)              ; rax = pred
michael@0:     movsxd      rbx,        dword ptr arg(4)    ; rbx = pred_stride
michael@0: 
michael@0:     mov         rcx,        16                  ; row counter
michael@0:     pxor        mm0,        mm0                 ; mm0 = 0 for unpacking
michael@0: 
michael@0: .next_row:
michael@0:     ; ---- pixels 0..7 of the row ----
michael@0:     movq        mm1,        [rsi]               ; 8 source bytes
michael@0:     movq        mm3,        [rax]               ; 8 predictor bytes
michael@0:     movq        mm2,        mm1                 ; copies for the high halves
michael@0:     movq        mm4,        mm3
michael@0: 
michael@0:     punpcklbw   mm1,        mm0                 ; pixels 0..3 as words
michael@0:     punpcklbw   mm3,        mm0
michael@0:     psubw       mm1,        mm3
michael@0:     movq        [rdi],      mm1
michael@0: 
michael@0:     punpckhbw   mm2,        mm0                 ; pixels 4..7 as words
michael@0:     punpckhbw   mm4,        mm0
michael@0:     psubw       mm2,        mm4
michael@0:     movq        [rdi+8],    mm2
michael@0: 
michael@0:     ; ---- pixels 8..15 of the row ----
michael@0:     movq        mm1,        [rsi+8]
michael@0:     movq        mm3,        [rax+8]
michael@0:     movq        mm2,        mm1
michael@0:     movq        mm4,        mm3
michael@0: 
michael@0:     punpcklbw   mm1,        mm0                 ; pixels 8..11 as words
michael@0:     punpcklbw   mm3,        mm0
michael@0:     psubw       mm1,        mm3
michael@0:     movq        [rdi+16],   mm1
michael@0: 
michael@0:     punpckhbw   mm2,        mm0                 ; pixels 12..15 as words
michael@0:     punpckhbw   mm4,        mm0
michael@0:     psubw       mm2,        mm4
michael@0:     movq        [rdi+24],   mm2
michael@0: 
michael@0:     ; ---- step all three pointers to the next row ----
michael@0:     lea         rsi,        [rsi+rdx]           ; src  += src_stride
michael@0:     lea         rax,        [rax+rbx]           ; pred += pred_stride
michael@0:     add         rdi,        32                  ; diff += 16 words
michael@0:     dec         rcx
michael@0:     jnz         .next_row
michael@0: 
michael@0:     ; begin epilog
michael@0:     pop         rbx
michael@0:     pop         rdi
michael@0:     pop         rsi
michael@0:     UNSHADOW_ARGS
michael@0:     pop         rbp
michael@0:     ret
michael@0: 
michael@0: 
michael@0: ;-----------------------------------------------------------------------------
michael@0: ; SUBTRACT_PLANE_8X8 (no parameters)
michael@0: ;
michael@0: ; Helper for vp8_subtract_mbuv_mmx: subtracts 8 rows of 8 pixels.
michael@0: ;   In:    rsi = source plane, rdx = source stride,
michael@0: ;          rax = predictor plane, rbx = predictor stride,
michael@0: ;          rdi = output (16-bit words), mm7 = 0
michael@0: ;   Out:   rdi advanced by 8*16 bytes; rsi and rax advanced by 8 strides
michael@0: ;   Clobb: rcx, mm0, mm1, mm3, mm4, flags
michael@0: ;-----------------------------------------------------------------------------
michael@0: %macro SUBTRACT_PLANE_8X8 0
michael@0:     mov         rcx,        8                   ; row counter
michael@0: %%next_row:
michael@0:     movq        mm0,        [rsi]               ; 8 source bytes
michael@0:     movq        mm1,        [rax]               ; 8 predictor bytes
michael@0:     movq        mm3,        mm0                 ; copies for the high halves
michael@0:     movq        mm4,        mm1
michael@0: 
michael@0:     punpcklbw   mm0,        mm7                 ; pixels 0..3 as words
michael@0:     punpcklbw   mm1,        mm7
michael@0:     psubw       mm0,        mm1
michael@0:     movq        [rdi],      mm0
michael@0: 
michael@0:     punpckhbw   mm3,        mm7                 ; pixels 4..7 as words
michael@0:     punpckhbw   mm4,        mm7
michael@0:     psubw       mm3,        mm4
michael@0:     movq        [rdi+8],    mm3
michael@0: 
michael@0:     add         rsi,        rdx                 ; source    += stride
michael@0:     add         rdi,        16                  ; output    += 8 words
michael@0:     add         rax,        rbx                 ; predictor += stride
michael@0:     dec         rcx
michael@0:     jnz         %%next_row
michael@0: %endmacro
michael@0: 
michael@0: ;-----------------------------------------------------------------------------
michael@0: ; void vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc,
michael@0: ;                            unsigned char *vsrc, int src_stride,
michael@0: ;                            unsigned char *upred, unsigned char *vpred,
michael@0: ;                            int pred_stride)
michael@0: ;
michael@0: ; Writes the 16-bit differences (usrc - upred) and (vsrc - vpred), each for
michael@0: ; 8 rows of 8 pixels. Output starts 256 words into diff; the U results are
michael@0: ; written first (64 words) and the V results follow immediately after.
michael@0: ; Both planes use the same src_stride and the same pred_stride.
michael@0: ;
michael@0: ; sym(), arg(), SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS are not defined in
michael@0: ; this file; presumably they come from vpx_ports/x86_abi_support.asm.
michael@0: ; NOTE(review): no emms is executed before returning; presumably the caller
michael@0: ; is responsible for clearing MMX state -- confirm.
michael@0: ;-----------------------------------------------------------------------------
michael@0: global sym(vp8_subtract_mbuv_mmx) PRIVATE
michael@0: sym(vp8_subtract_mbuv_mmx):
michael@0:     push        rbp
michael@0:     mov         rbp,        rsp
michael@0:     SHADOW_ARGS_TO_STACK 7
michael@0:     push        rsi
michael@0:     push        rdi
michael@0:     push        rbx                             ; rbx holds pred_stride below
michael@0:     ; end prolog
michael@0: 
michael@0:     ; Arguments shared by both planes.
michael@0:     mov         rdi,        arg(0)              ; rdi = diff
michael@0:     movsxd      rdx,        dword ptr arg(3)    ; rdx = src_stride
michael@0:     movsxd      rbx,        dword ptr arg(6)    ; rbx = pred_stride
michael@0:     add         rdi,        256*2               ; skip the first 256 words of diff
michael@0:     pxor        mm7,        mm7                 ; mm7 = 0 for unpacking
michael@0: 
michael@0:     ; U plane
michael@0:     mov         rsi,        arg(1)              ; rsi = usrc
michael@0:     mov         rax,        arg(4)              ; rax = upred
michael@0:     SUBTRACT_PLANE_8X8
michael@0: 
michael@0:     ; V plane; rdi already points just past the U output
michael@0:     mov         rsi,        arg(2)              ; rsi = vsrc
michael@0:     mov         rax,        arg(5)              ; rax = vpred
michael@0:     SUBTRACT_PLANE_8X8
michael@0: 
michael@0:     ; begin epilog
michael@0:     pop         rbx
michael@0:     pop         rdi
michael@0:     pop         rsi
michael@0:     UNSHADOW_ARGS
michael@0:     pop         rbp
michael@0:     ret