michael@0: ;
michael@0: ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0: ;
michael@0: ;  Use of this source code is governed by a BSD-style license
michael@0: ;  that can be found in the LICENSE file in the root of the source
michael@0: ;  tree. An additional intellectual property rights grant can be found
michael@0: ;  in the file PATENTS.  All contributing project authors may
michael@0: ;  be found in the AUTHORS file in the root of the source tree.
michael@0: ;
michael@0: 
michael@0: 
michael@0: %include "vpx_ports/x86_abi_support.asm"
michael@0: 
michael@0: ;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
michael@0: global sym(vp8_get_mb_ss_mmx) PRIVATE
michael@0: sym(vp8_get_mb_ss_mmx):
michael@0:     push        rbp
michael@0:     mov         rbp, rsp
michael@0:     SHADOW_ARGS_TO_STACK 7
michael@0:     GET_GOT     rbx
michael@0:     push rsi
michael@0:     push rdi
michael@0:     sub         rsp, 8
michael@0:     ; end prolog
michael@0: 
michael@0:         mov         rax, arg(0) ;src_ptr
michael@0:         mov         rcx, 16
michael@0:         pxor        mm4, mm4
michael@0: 
michael@0: .NEXTROW:
michael@0:         movq        mm0, [rax]
michael@0:         movq        mm1, [rax+8]
michael@0:         movq        mm2, [rax+16]
michael@0:         movq        mm3, [rax+24]
michael@0:         pmaddwd     mm0, mm0
michael@0:         pmaddwd     mm1, mm1
michael@0:         pmaddwd     mm2, mm2
michael@0:         pmaddwd     mm3, mm3
michael@0: 
michael@0:         paddd       mm4, mm0
michael@0:         paddd       mm4, mm1
michael@0:         paddd       mm4, mm2
michael@0:         paddd       mm4, mm3
michael@0: 
michael@0:         add         rax, 32
michael@0:         dec         rcx
michael@0:         ja          .NEXTROW
michael@0:         movq        QWORD PTR [rsp], mm4
michael@0: 
michael@0:         ;return sum[0]+sum[1];
michael@0:         movsxd      rax, dword ptr [rsp]
michael@0:         movsxd      rcx, dword ptr [rsp+4]
michael@0:         add         rax, rcx
michael@0: 
michael@0: 
michael@0:     ; begin epilog
michael@0:     add rsp, 8
michael@0:     pop rdi
michael@0:     pop rsi
michael@0:     RESTORE_GOT
michael@0:     UNSHADOW_ARGS
michael@0:     pop         rbp
michael@0:     ret
michael@0: 
michael@0: 
michael@0: ;unsigned int vp8_get8x8var_mmx
michael@0: ;(
michael@0: ;    unsigned char *src_ptr,
michael@0: ;    int  source_stride,
michael@0: ;    unsigned char *ref_ptr,
michael@0: ;    int  recon_stride,
michael@0: ;    unsigned int *SSE,
michael@0: ;    int *Sum
michael@0: ;)
michael@0: global sym(vp8_get8x8var_mmx) PRIVATE
michael@0: sym(vp8_get8x8var_mmx):
michael@0:     push        rbp
michael@0:     mov         rbp, rsp
michael@0:     SHADOW_ARGS_TO_STACK 6
michael@0:     push rsi
michael@0:     push rdi
michael@0:     push rbx
michael@0:     sub         rsp, 16
michael@0:     ; end prolog
michael@0: 
michael@0: 
michael@0:         pxor        mm5, mm5                    ; Blank mmx6
michael@0:         pxor        mm6, mm6                    ; Blank mmx7
michael@0:         pxor        mm7, mm7                    ; Blank mmx7
michael@0: 
michael@0:         mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
michael@0:         mov         rbx, arg(2) ;[ref_ptr]
michael@0:         movsxd      rcx, dword ptr arg(1) ;[source_stride]
michael@0:         movsxd      rdx, dword ptr arg(3) ;[recon_stride]
michael@0: 
michael@0:         ; Row 1
michael@0:         movq        mm0, [rax]                  ; Copy eight bytes to mm0
michael@0:         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
michael@0:         movq        mm2, mm0                    ; Take copies
michael@0:         movq        mm3, mm1                    ; Take copies
michael@0: 
michael@0:         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
michael@0:         punpcklbw   mm1, mm6
michael@0:         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
michael@0:         punpckhbw   mm3, mm6
michael@0:         psubsw      mm0, mm1                    ; A-B (low order) to MM0
michael@0:         psubsw      mm2, mm3                    ; A-B (high order) to MM2
michael@0: 
michael@0:         paddw       mm5, mm0                    ; accumulate differences in mm5
michael@0:         paddw       mm5, mm2                    ; accumulate differences in mm5
michael@0: 
michael@0:         pmaddwd     mm0, mm0                    ; square and accumulate
michael@0:         pmaddwd     mm2, mm2                    ; square and accumulate
michael@0:         add         rbx,rdx                     ; Inc pointer into ref data
michael@0:         add         rax,rcx                     ; Inc pointer into the new data
michael@0:         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
michael@0:         paddd       mm7, mm0                    ; accumulate in mm7
michael@0:         paddd       mm7, mm2                    ; accumulate in mm7
michael@0: 
michael@0: 
michael@0:         ; Row 2
michael@0:         movq        mm0, [rax]                  ; Copy eight bytes to mm0
michael@0:         movq        mm2, mm0                    ; Take copies
michael@0:         movq        mm3, mm1                    ; Take copies
michael@0: 
michael@0:         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
michael@0:         punpcklbw   mm1, mm6
michael@0:         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
michael@0:         punpckhbw   mm3, mm6
michael@0:         psubsw      mm0, mm1                    ; A-B (low order) to MM0
michael@0:         psubsw      mm2, mm3                    ; A-B (high order) to MM2
michael@0: 
michael@0:         paddw       mm5, mm0                    ; accumulate differences in mm5
michael@0:         paddw       mm5, mm2                    ; accumulate differences in mm5
michael@0: 
michael@0:         pmaddwd     mm0, mm0                    ; square and accumulate
michael@0:         pmaddwd     mm2, mm2                    ; square and accumulate
michael@0:         add         rbx,rdx                     ; Inc pointer into ref data
michael@0:         add         rax,rcx                     ; Inc pointer into the new data
michael@0:         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
michael@0:         paddd       mm7, mm0                    ; accumulate in mm7
michael@0:         paddd       mm7, mm2                    ; accumulate in mm7
michael@0: 
michael@0:         ; Row 3
michael@0:         movq        mm0, [rax]                  ; Copy eight bytes to mm0
michael@0:         movq        mm2, mm0                    ; Take copies
michael@0:         movq        mm3, mm1                    ; Take copies
michael@0: 
michael@0:         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
michael@0:         punpcklbw   mm1, mm6
michael@0:         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
michael@0:         punpckhbw   mm3, mm6
michael@0:         psubsw      mm0, mm1                    ; A-B (low order) to MM0
michael@0:         psubsw      mm2, mm3                    ; A-B (high order) to MM2
michael@0: 
michael@0:         paddw       mm5, mm0                    ; accumulate differences in mm5
michael@0:         paddw       mm5, mm2                    ; accumulate differences in mm5
michael@0: 
michael@0:         pmaddwd     mm0, mm0                    ; square and accumulate
michael@0:         pmaddwd     mm2, mm2                    ; square and accumulate
michael@0:         add         rbx,rdx                     ; Inc pointer into ref data
michael@0:         add         rax,rcx                     ; Inc pointer into the new data
michael@0:         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
michael@0:         paddd       mm7, mm0                    ; accumulate in mm7
michael@0:         paddd       mm7, mm2                    ; accumulate in mm7
michael@0: 
michael@0:         ; Row 4
michael@0:         movq        mm0, [rax]                  ; Copy eight bytes to mm0
michael@0:         movq        mm2, mm0                    ; Take copies
michael@0:         movq        mm3, mm1                    ; Take copies
michael@0: 
michael@0:         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
michael@0:         punpcklbw   mm1, mm6
michael@0:         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
michael@0:         punpckhbw   mm3, mm6
michael@0:         psubsw      mm0, mm1                    ; A-B (low order) to MM0
michael@0:         psubsw      mm2, mm3                    ; A-B (high order) to MM2
michael@0: 
michael@0:         paddw       mm5, mm0                    ; accumulate differences in mm5
michael@0:         paddw       mm5, mm2                    ; accumulate differences in mm5
michael@0: 
michael@0:         pmaddwd     mm0, mm0                    ; square and accumulate
michael@0:         pmaddwd     mm2, mm2                    ; square and accumulate
michael@0:         add         rbx,rdx                     ; Inc pointer into ref data
michael@0:         add         rax,rcx                     ; Inc pointer into the new data
michael@0:         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
michael@0:         paddd       mm7, mm0                    ; accumulate in mm7
michael@0:         paddd       mm7, mm2                    ; accumulate in mm7
michael@0: 
michael@0:         ; Row 5
michael@0:         movq        mm0, [rax]                  ; Copy eight bytes to mm0
michael@0:         movq        mm2, mm0                    ; Take copies
michael@0:         movq        mm3, mm1                    ; Take copies
michael@0: 
michael@0:         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
michael@0:         punpcklbw   mm1, mm6
michael@0:         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
michael@0:         punpckhbw   mm3, mm6
michael@0:         psubsw      mm0, mm1                    ; A-B (low order) to MM0
michael@0:         psubsw      mm2, mm3                    ; A-B (high order) to MM2
michael@0: 
michael@0:         paddw       mm5, mm0                    ; accumulate differences in mm5
michael@0:         paddw       mm5, mm2                    ; accumulate differences in mm5
michael@0: 
michael@0:         pmaddwd     mm0, mm0                    ; square and accumulate
michael@0:         pmaddwd     mm2, mm2                    ; square and accumulate
michael@0:         add         rbx,rdx                     ; Inc pointer into ref data
michael@0:         add         rax,rcx                     ; Inc pointer into the new data
michael@0:         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
michael@0:         ;              movq        mm4, [rbx + rdx]
michael@0:         paddd       mm7, mm0                    ; accumulate in mm7
michael@0:         paddd       mm7, mm2                    ; accumulate in mm7
michael@0: 
michael@0:         ; Row 6
michael@0:         movq        mm0, [rax]                  ; Copy eight bytes to mm0
michael@0:         movq        mm2, mm0                    ; Take copies
michael@0:         movq        mm3, mm1                    ; Take copies
michael@0: 
michael@0:         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
michael@0:         punpcklbw   mm1, mm6
michael@0:         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
michael@0:         punpckhbw   mm3, mm6
michael@0:         psubsw      mm0, mm1                    ; A-B (low order) to MM0
michael@0:         psubsw      mm2, mm3                    ; A-B (high order) to MM2
michael@0: 
michael@0:         paddw       mm5, mm0                    ; accumulate differences in mm5
michael@0:         paddw       mm5, mm2                    ; accumulate differences in mm5
michael@0: 
michael@0:         pmaddwd     mm0, mm0                    ; square and accumulate
michael@0:         pmaddwd     mm2, mm2                    ; square and accumulate
michael@0:         add         rbx,rdx                     ; Inc pointer into ref data
michael@0:         add         rax,rcx                     ; Inc pointer into the new data
michael@0:         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
michael@0:         paddd       mm7, mm0                    ; accumulate in mm7
michael@0:         paddd       mm7, mm2                    ; accumulate in mm7
michael@0: 
michael@0:         ; Row 7
michael@0:         movq        mm0, [rax]                  ; Copy eight bytes to mm0
michael@0:         movq        mm2, mm0                    ; Take copies
michael@0:         movq        mm3, mm1                    ; Take copies
michael@0: 
michael@0:         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
michael@0:         punpcklbw   mm1, mm6
michael@0:         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
michael@0:         punpckhbw   mm3, mm6
michael@0:         psubsw      mm0, mm1                    ; A-B (low order) to MM0
michael@0:         psubsw      mm2, mm3                    ; A-B (high order) to MM2
michael@0: 
michael@0:         paddw       mm5, mm0                    ; accumulate differences in mm5
michael@0:         paddw       mm5, mm2                    ; accumulate differences in mm5
michael@0: 
michael@0:         pmaddwd     mm0, mm0                    ; square and accumulate
michael@0:         pmaddwd     mm2, mm2                    ; square and accumulate
michael@0:         add         rbx,rdx                     ; Inc pointer into ref data
michael@0:         add         rax,rcx                     ; Inc pointer into the new data
michael@0:         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
michael@0:         paddd       mm7, mm0                    ; accumulate in mm7
michael@0:         paddd       mm7, mm2                    ; accumulate in mm7
michael@0: 
michael@0:         ; Row 8
michael@0:         movq        mm0, [rax]                  ; Copy eight bytes to mm0
michael@0:         movq        mm2, mm0                    ; Take copies
michael@0:         movq        mm3, mm1                    ; Take copies
michael@0: 
michael@0:         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
michael@0:         punpcklbw   mm1, mm6
michael@0:         punpckhbw   mm2, mm6                    ; unpack to higher prrcision
michael@0:         punpckhbw   mm3, mm6
michael@0:         psubsw      mm0, mm1                    ; A-B (low order) to MM0
michael@0:         psubsw      mm2, mm3                    ; A-B (high order) to MM2
michael@0: 
michael@0:         paddw       mm5, mm0                    ; accumulate differences in mm5
michael@0:         paddw       mm5, mm2                    ; accumulate differences in mm5
michael@0: 
michael@0:         pmaddwd     mm0, mm0                    ; square and accumulate
michael@0:         pmaddwd     mm2, mm2                    ; square and accumulate
michael@0:         add         rbx,rdx                     ; Inc pointer into ref data
michael@0:         add         rax,rcx                     ; Inc pointer into the new data
michael@0:         paddd       mm7, mm0                    ; accumulate in mm7
michael@0:         paddd       mm7, mm2                    ; accumulate in mm7
michael@0: 
michael@0:         ; Now accumulate the final results.
michael@0:         movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
michael@0:         movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
michael@0:         movsx       rdx, WORD PTR [rsp+8]
michael@0:         movsx       rcx, WORD PTR [rsp+10]
michael@0:         movsx       rbx, WORD PTR [rsp+12]
michael@0:         movsx       rax, WORD PTR [rsp+14]
michael@0:         add         rdx, rcx
michael@0:         add         rbx, rax
michael@0:         add         rdx, rbx    ;XSum
michael@0:         movsxd      rax, DWORD PTR [rsp]
michael@0:         movsxd      rcx, DWORD PTR [rsp+4]
michael@0:         add         rax, rcx    ;XXSum
michael@0:         mov         rsi, arg(4) ;SSE
michael@0:         mov         rdi, arg(5) ;Sum
michael@0:         mov         dword ptr [rsi], eax
michael@0:         mov         dword ptr [rdi], edx
michael@0:         xor         rax, rax    ; return 0
michael@0: 
michael@0: 
michael@0:     ; begin epilog
michael@0:     add rsp, 16
michael@0:     pop rbx
michael@0:     pop rdi
michael@0:     pop rsi
michael@0:     UNSHADOW_ARGS
michael@0:     pop         rbp
michael@0:     ret
michael@0: 
michael@0: 
michael@0: 
michael@0: ;unsigned int
michael@0: ;vp8_get4x4var_mmx
michael@0: ;(
michael@0: ;    unsigned char *src_ptr,
michael@0: ;    int  source_stride,
michael@0: ;    unsigned char *ref_ptr,
michael@0: ;    int  recon_stride,
michael@0: ;    unsigned int *SSE,
michael@0: ;    int *Sum
michael@0: ;)
michael@0: global sym(vp8_get4x4var_mmx) PRIVATE
michael@0: sym(vp8_get4x4var_mmx):
michael@0:     push        rbp
michael@0:     mov         rbp, rsp
michael@0:     SHADOW_ARGS_TO_STACK 6
michael@0:     push rsi
michael@0:     push rdi
michael@0:     push rbx
michael@0:     sub         rsp, 16
michael@0:     ; end prolog
michael@0: 
michael@0: 
michael@0:         pxor        mm5, mm5                    ; Blank mmx6
michael@0:         pxor        mm6, mm6                    ; Blank mmx7
michael@0:         pxor        mm7, mm7                    ; Blank mmx7
michael@0: 
michael@0:         mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
michael@0:         mov         rbx, arg(2) ;[ref_ptr]
michael@0:         movsxd      rcx, dword ptr arg(1) ;[source_stride]
michael@0:         movsxd      rdx, dword ptr arg(3) ;[recon_stride]
michael@0: 
michael@0:         ; Row 1
michael@0:         movq        mm0, [rax]                  ; Copy eight bytes to mm0
michael@0:         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
michael@0:         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
michael@0:         punpcklbw   mm1, mm6
michael@0:         psubsw      mm0, mm1                    ; A-B (low order) to MM0
michael@0:         paddw       mm5, mm0                    ; accumulate differences in mm5
michael@0:         pmaddwd     mm0, mm0                    ; square and accumulate
michael@0:         add         rbx,rdx                     ; Inc pointer into ref data
michael@0:         add         rax,rcx                     ; Inc pointer into the new data
michael@0:         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
michael@0:         paddd       mm7, mm0                    ; accumulate in mm7
michael@0: 
michael@0: 
michael@0:         ; Row 2
michael@0:         movq        mm0, [rax]                  ; Copy eight bytes to mm0
michael@0:         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
michael@0:         punpcklbw   mm1, mm6
michael@0:         psubsw      mm0, mm1                    ; A-B (low order) to MM0
michael@0:         paddw       mm5, mm0                    ; accumulate differences in mm5
michael@0: 
michael@0:         pmaddwd     mm0, mm0                    ; square and accumulate
michael@0:         add         rbx,rdx                     ; Inc pointer into ref data
michael@0:         add         rax,rcx                     ; Inc pointer into the new data
michael@0:         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
michael@0:         paddd       mm7, mm0                    ; accumulate in mm7
michael@0: 
michael@0:         ; Row 3
michael@0:         movq        mm0, [rax]                  ; Copy eight bytes to mm0
michael@0:         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
michael@0:         punpcklbw   mm1, mm6
michael@0:         psubsw      mm0, mm1                    ; A-B (low order) to MM0
michael@0:         paddw       mm5, mm0                    ; accumulate differences in mm5
michael@0: 
michael@0:         pmaddwd     mm0, mm0                    ; square and accumulate
michael@0:         add         rbx,rdx                     ; Inc pointer into ref data
michael@0:         add         rax,rcx                     ; Inc pointer into the new data
michael@0:         movq        mm1, [rbx]                  ; Copy eight bytes to mm1
michael@0:         paddd       mm7, mm0                    ; accumulate in mm7
michael@0: 
michael@0:         ; Row 4
michael@0:         movq        mm0, [rax]                  ; Copy eight bytes to mm0
michael@0: 
michael@0:         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
michael@0:         punpcklbw   mm1, mm6
michael@0:         psubsw      mm0, mm1                    ; A-B (low order) to MM0
michael@0: 
michael@0:         paddw       mm5, mm0                    ; accumulate differences in mm5
michael@0: 
michael@0:         pmaddwd     mm0, mm0                    ; square and accumulate
michael@0:         paddd       mm7, mm0                    ; accumulate in mm7
michael@0: 
michael@0: 
michael@0:         ; Now accumulate the final results.
michael@0:         movq        QWORD PTR [rsp+8], mm5      ; copy back accumulated results into normal memory
michael@0:         movq        QWORD PTR [rsp], mm7        ; copy back accumulated results into normal memory
michael@0:         movsx       rdx, WORD PTR [rsp+8]
michael@0:         movsx       rcx, WORD PTR [rsp+10]
michael@0:         movsx       rbx, WORD PTR [rsp+12]
michael@0:         movsx       rax, WORD PTR [rsp+14]
michael@0:         add         rdx, rcx
michael@0:         add         rbx, rax
michael@0:         add         rdx, rbx    ;XSum
michael@0:         movsxd      rax, DWORD PTR [rsp]
michael@0:         movsxd      rcx, DWORD PTR [rsp+4]
michael@0:         add         rax, rcx    ;XXSum
michael@0:         mov         rsi, arg(4) ;SSE
michael@0:         mov         rdi, arg(5) ;Sum
michael@0:         mov         dword ptr [rsi], eax
michael@0:         mov         dword ptr [rdi], edx
michael@0:         xor         rax, rax    ; return 0
michael@0: 
michael@0: 
michael@0:     ; begin epilog
michael@0:     add rsp, 16
michael@0:     pop rbx
michael@0:     pop rdi
michael@0:     pop rsi
michael@0:     UNSHADOW_ARGS
michael@0:     pop         rbp
michael@0:     ret
michael@0: 
michael@0: 
michael@0: 
michael@0: ;unsigned int
michael@0: ;vp8_get4x4sse_cs_mmx
michael@0: ;(
michael@0: ;    unsigned char *src_ptr,
michael@0: ;    int  source_stride,
michael@0: ;    unsigned char *ref_ptr,
michael@0: ;    int  recon_stride
michael@0: ;)
michael@0: global sym(vp8_get4x4sse_cs_mmx) PRIVATE
michael@0: sym(vp8_get4x4sse_cs_mmx):
michael@0:     push        rbp
michael@0:     mov         rbp, rsp
michael@0:     SHADOW_ARGS_TO_STACK 4
michael@0:     push rsi
michael@0:     push rdi
michael@0:     push rbx
michael@0:     ; end prolog
michael@0: 
michael@0: 
michael@0:         pxor        mm6, mm6                    ; Blank mmx7
michael@0:         pxor        mm7, mm7                    ; Blank mmx7
michael@0: 
michael@0:         mov         rax, arg(0) ;[src_ptr]  ; Load base addresses
michael@0:         mov         rbx, arg(2) ;[ref_ptr]
michael@0:         movsxd      rcx, dword ptr arg(1) ;[source_stride]
michael@0:         movsxd      rdx, dword ptr arg(3) ;[recon_stride]
michael@0:         ; Row 1
michael@0:         movd        mm0, [rax]                  ; Copy eight bytes to mm0
michael@0:         movd        mm1, [rbx]                  ; Copy eight bytes to mm1
michael@0:         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
michael@0:         punpcklbw   mm1, mm6
michael@0:         psubsw      mm0, mm1                    ; A-B (low order) to MM0
michael@0:         pmaddwd     mm0, mm0                    ; square and accumulate
michael@0:         add         rbx,rdx                     ; Inc pointer into ref data
michael@0:         add         rax,rcx                     ; Inc pointer into the new data
michael@0:         movd        mm1, [rbx]                  ; Copy eight bytes to mm1
michael@0:         paddd       mm7, mm0                    ; accumulate in mm7
michael@0: 
michael@0:         ; Row 2
michael@0:         movd        mm0, [rax]                  ; Copy eight bytes to mm0
michael@0:         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
michael@0:         punpcklbw   mm1, mm6
michael@0:         psubsw      mm0, mm1                    ; A-B (low order) to MM0
michael@0:         pmaddwd     mm0, mm0                    ; square and accumulate
michael@0:         add         rbx,rdx                     ; Inc pointer into ref data
michael@0:         add         rax,rcx                     ; Inc pointer into the new data
michael@0:         movd        mm1, [rbx]                  ; Copy eight bytes to mm1
michael@0:         paddd       mm7, mm0                    ; accumulate in mm7
michael@0: 
michael@0:         ; Row 3
michael@0:         movd        mm0, [rax]                  ; Copy eight bytes to mm0
michael@0:         punpcklbw   mm1, mm6
michael@0:         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
michael@0:         psubsw      mm0, mm1                    ; A-B (low order) to MM0
michael@0: 
michael@0:         pmaddwd     mm0, mm0                    ; square and accumulate
michael@0:         add         rbx,rdx                     ; Inc pointer into ref data
michael@0:         add         rax,rcx                     ; Inc pointer into the new data
michael@0:         movd        mm1, [rbx]                  ; Copy eight bytes to mm1
michael@0:         paddd       mm7, mm0                    ; accumulate in mm7
michael@0: 
michael@0:         ; Row 4
michael@0:         movd        mm0, [rax]                  ; Copy eight bytes to mm0
michael@0:         punpcklbw   mm0, mm6                    ; unpack to higher prrcision
michael@0:         punpcklbw   mm1, mm6
michael@0:         psubsw      mm0, mm1                    ; A-B (low order) to MM0
michael@0:         pmaddwd     mm0, mm0                    ; square and accumulate
michael@0:         paddd       mm7, mm0                    ; accumulate in mm7
michael@0: 
michael@0:         movq        mm0,    mm7                 ;
michael@0:         psrlq       mm7,    32
michael@0: 
michael@0:         paddd       mm0,    mm7
michael@0:         movq        rax,    mm0
michael@0: 
michael@0: 
michael@0:     ; begin epilog
michael@0:     pop rbx
michael@0:     pop rdi
michael@0:     pop rsi
michael@0:     UNSHADOW_ARGS
michael@0:     pop         rbp
michael@0:     ret
michael@0: 
michael@0: %define mmx_filter_shift            7
michael@0: 
michael@0: ;void vp8_filter_block2d_bil4x4_var_mmx
michael@0: ;(
michael@0: ;    unsigned char *ref_ptr,
michael@0: ;    int ref_pixels_per_line,
michael@0: ;    unsigned char *src_ptr,
michael@0: ;    int src_pixels_per_line,
michael@0: ;    unsigned short *HFilter,
michael@0: ;    unsigned short *VFilter,
michael@0: ;    int *sum,
michael@0: ;    unsigned int *sumsquared
michael@0: ;)
michael@0: global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
michael@0: sym(vp8_filter_block2d_bil4x4_var_mmx):
michael@0:     push        rbp
michael@0:     mov         rbp, rsp
michael@0:     SHADOW_ARGS_TO_STACK 8
michael@0:     GET_GOT     rbx
michael@0:     push rsi
michael@0:     push rdi
michael@0:     sub         rsp, 16
michael@0:     ; end prolog
michael@0: 
michael@0: 
michael@0:         pxor            mm6,            mm6                 ;
michael@0:         pxor            mm7,            mm7                 ;
michael@0: 
michael@0:         mov             rax,            arg(4) ;HFilter             ;
michael@0:         mov             rdx,            arg(5) ;VFilter             ;
michael@0: 
michael@0:         mov             rsi,            arg(0) ;ref_ptr              ;
michael@0:         mov             rdi,            arg(2) ;src_ptr              ;
michael@0: 
michael@0:         mov             rcx,            4                   ;
michael@0:         pxor            mm0,            mm0                 ;
michael@0: 
michael@0:         movd            mm1,            [rsi]               ;
michael@0:         movd            mm3,            [rsi+1]             ;
michael@0: 
michael@0:         punpcklbw       mm1,            mm0                 ;
michael@0:         pmullw          mm1,            [rax]               ;
michael@0: 
michael@0:         punpcklbw       mm3,            mm0                 ;
michael@0:         pmullw          mm3,            [rax+8]             ;
michael@0: 
michael@0:         paddw           mm1,            mm3                 ;
michael@0:         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
michael@0: 
michael@0:         psraw           mm1,            mmx_filter_shift    ;
michael@0:         movq            mm5,            mm1
michael@0: 
michael@0: %if ABI_IS_32BIT
michael@0:         add             rsi, dword ptr  arg(1) ;ref_pixels_per_line    ;
michael@0: %else
michael@0:         movsxd          r8, dword ptr  arg(1) ;ref_pixels_per_line    ;
michael@0:         add             rsi, r8
michael@0: %endif
michael@0: 
michael@0: .filter_block2d_bil4x4_var_mmx_loop:
michael@0: 
michael@0:         movd            mm1,            [rsi]               ;
michael@0:         movd            mm3,            [rsi+1]             ;
michael@0: 
michael@0:         punpcklbw       mm1,            mm0                 ;
michael@0:         pmullw          mm1,            [rax]               ;
michael@0: 
michael@0:         punpcklbw       mm3,            mm0                 ;
michael@0:         pmullw          mm3,            [rax+8]             ;
michael@0: 
michael@0:         paddw           mm1,            mm3                 ;
michael@0:         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
michael@0: 
michael@0:         psraw           mm1,            mmx_filter_shift    ;
michael@0:         movq            mm3,            mm5                 ;
michael@0: 
michael@0:         movq            mm5,            mm1                 ;
michael@0:         pmullw          mm3,            [rdx]               ;
michael@0: 
michael@0:         pmullw          mm1,            [rdx+8]             ;
michael@0:         paddw           mm1,            mm3                 ;
michael@0: 
michael@0: 
michael@0:         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
michael@0:         psraw           mm1,            mmx_filter_shift    ;
michael@0: 
michael@0:         movd            mm3,            [rdi]               ;
michael@0:         punpcklbw       mm3,            mm0                 ;
michael@0: 
michael@0:         psubw           mm1,            mm3                 ;
michael@0:         paddw           mm6,            mm1                 ;
michael@0: 
michael@0:         pmaddwd         mm1,            mm1                 ;
michael@0:         paddd           mm7,            mm1                 ;
michael@0: 
michael@0: %if ABI_IS_32BIT
michael@0:         add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
michael@0:         add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
michael@0: %else
michael@0:         movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
michael@0:         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line
michael@0:         add             rsi,            r8
michael@0:         add             rdi,            r9
michael@0: %endif
michael@0:         sub             rcx,            1                   ;
michael@0:         jnz             .filter_block2d_bil4x4_var_mmx_loop       ;
michael@0: 
michael@0: 
michael@0:         pxor            mm3,            mm3                 ;
michael@0:         pxor            mm2,            mm2                 ;
michael@0: 
michael@0:         punpcklwd       mm2,            mm6                 ;
michael@0:         punpckhwd       mm3,            mm6                 ;
michael@0: 
michael@0:         paddd           mm2,            mm3                 ;
michael@0:         movq            mm6,            mm2                 ;
michael@0: 
michael@0:         psrlq           mm6,            32                  ;
michael@0:         paddd           mm2,            mm6                 ;
michael@0: 
michael@0:         psrad           mm2,            16                  ;
michael@0:         movq            mm4,            mm7                 ;
michael@0: 
michael@0:         psrlq           mm4,            32                  ;
michael@0:         paddd           mm4,            mm7                 ;
michael@0: 
michael@0:         mov             rdi,            arg(6) ;sum
michael@0:         mov             rsi,            arg(7) ;sumsquared
michael@0: 
michael@0:         movd            dword ptr [rdi],          mm2                 ;
michael@0:         movd            dword ptr [rsi],          mm4                 ;
michael@0: 
michael@0: 
michael@0: 
michael@0:     ; begin epilog
michael@0:     add rsp, 16
michael@0:     pop rdi
michael@0:     pop rsi
michael@0:     RESTORE_GOT
michael@0:     UNSHADOW_ARGS
michael@0:     pop         rbp
michael@0:     ret
michael@0: 
michael@0: 
michael@0: 
michael@0: 
michael@0: ;void vp8_filter_block2d_bil_var_mmx
michael@0: ;(
michael@0: ;    unsigned char *ref_ptr,
michael@0: ;    int ref_pixels_per_line,
michael@0: ;    unsigned char *src_ptr,
michael@0: ;    int src_pixels_per_line,
michael@0: ;    unsigned int Height,
michael@0: ;    unsigned short *HFilter,
michael@0: ;    unsigned short *VFilter,
michael@0: ;    int *sum,
michael@0: ;    unsigned int *sumsquared
michael@0: ;)
michael@0: global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
michael@0: sym(vp8_filter_block2d_bil_var_mmx):
michael@0:     push        rbp
michael@0:     mov         rbp, rsp
michael@0:     SHADOW_ARGS_TO_STACK 9
michael@0:     GET_GOT     rbx
michael@0:     push rsi
michael@0:     push rdi
michael@0:     sub         rsp, 16
michael@0:     ; end prolog
michael@0: 
michael@0:         pxor            mm6,            mm6                 ;
michael@0:         pxor            mm7,            mm7                 ;
michael@0:         mov             rax,            arg(5) ;HFilter             ;
michael@0: 
michael@0:         mov             rdx,            arg(6) ;VFilter             ;
michael@0:         mov             rsi,            arg(0) ;ref_ptr              ;
michael@0: 
michael@0:         mov             rdi,            arg(2) ;src_ptr              ;
michael@0:         movsxd          rcx,            dword ptr arg(4) ;Height              ;
michael@0: 
michael@0:         pxor            mm0,            mm0                 ;
michael@0:         movq            mm1,            [rsi]               ;
michael@0: 
michael@0:         movq            mm3,            [rsi+1]             ;
michael@0:         movq            mm2,            mm1                 ;
michael@0: 
michael@0:         movq            mm4,            mm3                 ;
michael@0:         punpcklbw       mm1,            mm0                 ;
michael@0: 
michael@0:         punpckhbw       mm2,            mm0                 ;
michael@0:         pmullw          mm1,            [rax]               ;
michael@0: 
michael@0:         pmullw          mm2,            [rax]               ;
michael@0:         punpcklbw       mm3,            mm0                 ;
michael@0: 
michael@0:         punpckhbw       mm4,            mm0                 ;
michael@0:         pmullw          mm3,            [rax+8]             ;
michael@0: 
michael@0:         pmullw          mm4,            [rax+8]             ;
michael@0:         paddw           mm1,            mm3                 ;
michael@0: 
michael@0:         paddw           mm2,            mm4                 ;
michael@0:         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
michael@0: 
michael@0:         psraw           mm1,            mmx_filter_shift    ;
michael@0:         paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
michael@0: 
michael@0:         psraw           mm2,            mmx_filter_shift    ;
michael@0:         movq            mm5,            mm1
michael@0: 
michael@0:         packuswb        mm5,            mm2                 ;
michael@0: %if ABI_IS_32BIT
michael@0:         add             rsi,            dword ptr arg(1) ;ref_pixels_per_line
michael@0: %else
michael@0:         movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line
michael@0:         add             rsi,            r8
michael@0: %endif
michael@0: 
michael@0: .filter_block2d_bil_var_mmx_loop:
michael@0: 
michael@0:         movq            mm1,            [rsi]               ;
michael@0:         movq            mm3,            [rsi+1]             ;
michael@0: 
michael@0:         movq            mm2,            mm1                 ;
michael@0:         movq            mm4,            mm3                 ;
michael@0: 
michael@0:         punpcklbw       mm1,            mm0                 ;
michael@0:         punpckhbw       mm2,            mm0                 ;
michael@0: 
michael@0:         pmullw          mm1,            [rax]               ;
michael@0:         pmullw          mm2,            [rax]               ;
michael@0: 
michael@0:         punpcklbw       mm3,            mm0                 ;
michael@0:         punpckhbw       mm4,            mm0                 ;
michael@0: 
michael@0:         pmullw          mm3,            [rax+8]             ;
michael@0:         pmullw          mm4,            [rax+8]             ;
michael@0: 
michael@0:         paddw           mm1,            mm3                 ;
michael@0:         paddw           mm2,            mm4                 ;
michael@0: 
michael@0:         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
michael@0:         psraw           mm1,            mmx_filter_shift    ;
michael@0: 
michael@0:         paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
michael@0:         psraw           mm2,            mmx_filter_shift    ;
michael@0: 
michael@0:         movq            mm3,            mm5                 ;
michael@0:         movq            mm4,            mm5                 ;
michael@0: 
michael@0:         punpcklbw       mm3,            mm0                 ;
michael@0:         punpckhbw       mm4,            mm0                 ;
michael@0: 
michael@0:         movq            mm5,            mm1                 ;
michael@0:         packuswb        mm5,            mm2                 ;
michael@0: 
michael@0:         pmullw          mm3,            [rdx]               ;
michael@0:         pmullw          mm4,            [rdx]               ;
michael@0: 
michael@0:         pmullw          mm1,            [rdx+8]             ;
michael@0:         pmullw          mm2,            [rdx+8]             ;
michael@0: 
michael@0:         paddw           mm1,            mm3                 ;
michael@0:         paddw           mm2,            mm4                 ;
michael@0: 
michael@0:         paddw           mm1,            [GLOBAL(mmx_bi_rd)] ;
michael@0:         paddw           mm2,            [GLOBAL(mmx_bi_rd)] ;
michael@0: 
michael@0:         psraw           mm1,            mmx_filter_shift    ;
michael@0:         psraw           mm2,            mmx_filter_shift    ;
michael@0: 
michael@0:         movq            mm3,            [rdi]               ;
michael@0:         movq            mm4,            mm3                 ;
michael@0: 
michael@0:         punpcklbw       mm3,            mm0                 ;
michael@0:         punpckhbw       mm4,            mm0                 ;
michael@0: 
michael@0:         psubw           mm1,            mm3                 ;
michael@0:         psubw           mm2,            mm4                 ;
michael@0: 
michael@0:         paddw           mm6,            mm1                 ;
michael@0:         pmaddwd         mm1,            mm1                 ;
michael@0: 
michael@0:         paddw           mm6,            mm2                 ;
michael@0:         pmaddwd         mm2,            mm2                 ;
michael@0: 
michael@0:         paddd           mm7,            mm1                 ;
michael@0:         paddd           mm7,            mm2                 ;
michael@0: 
michael@0: %if ABI_IS_32BIT
michael@0:         add             rsi,            dword ptr arg(1) ;ref_pixels_per_line    ;
michael@0:         add             rdi,            dword ptr arg(3) ;src_pixels_per_line    ;
michael@0: %else
michael@0:         movsxd          r8,             dword ptr arg(1) ;ref_pixels_per_line    ;
michael@0:         movsxd          r9,             dword ptr arg(3) ;src_pixels_per_line    ;
michael@0:         add             rsi,            r8
michael@0:         add             rdi,            r9
michael@0: %endif
michael@0:         sub             rcx,            1                   ;
michael@0:         jnz             .filter_block2d_bil_var_mmx_loop       ;
michael@0: 
michael@0: 
michael@0:         pxor            mm3,            mm3                 ;
michael@0:         pxor            mm2,            mm2                 ;
michael@0: 
michael@0:         punpcklwd       mm2,            mm6                 ;
michael@0:         punpckhwd       mm3,            mm6                 ;
michael@0: 
michael@0:         paddd           mm2,            mm3                 ;
michael@0:         movq            mm6,            mm2                 ;
michael@0: 
michael@0:         psrlq           mm6,            32                  ;
michael@0:         paddd           mm2,            mm6                 ;
michael@0: 
michael@0:         psrad           mm2,            16                  ;
michael@0:         movq            mm4,            mm7                 ;
michael@0: 
michael@0:         psrlq           mm4,            32                  ;
michael@0:         paddd           mm4,            mm7                 ;
michael@0: 
michael@0:         mov             rdi,            arg(7) ;sum
michael@0:         mov             rsi,            arg(8) ;sumsquared
michael@0: 
michael@0:         movd            dword ptr [rdi],          mm2                 ;
michael@0:         movd            dword ptr [rsi],          mm4                 ;
michael@0: 
michael@0:     ; begin epilog
michael@0:     add rsp, 16
michael@0:     pop rdi
michael@0:     pop rsi
michael@0:     RESTORE_GOT
michael@0:     UNSHADOW_ARGS
michael@0:     pop         rbp
michael@0:     ret
michael@0: 
michael@0: 
michael@0: SECTION_RODATA
michael@0: ;short mmx_bi_rd[4] = { 64, 64, 64, 64};
michael@0: align 16
michael@0: mmx_bi_rd:
michael@0:     times 4 dw 64