;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Syntax: NASM/Yasm (Intel operand order).  Register names, arg(), sym() and
; the prologue/epilogue macros come from x86_abi_support.asm.
;
; NOTE(review): no routine in this file executes emms; presumably the callers
; clear the MMX state before using x87 code -- confirm against the callers.

; vp9_get_mb_ss_mmx consumes MB_SS_BYTES_PER_ITER bytes (16 shorts) per loop
; iteration, MB_SS_ITERATIONS times, i.e. 256 shorts in total.
%define MB_SS_ITERATIONS    16
%define MB_SS_BYTES_PER_ITER 32

;-----------------------------------------------------------------------
; VAR_ROW8
;   Fold one 8-pixel row of (src - ref) into the running totals.
;   In:    rax -> current source row
;          mm1 =  8 reference bytes of the current row (already loaded)
;          mm6 =  0 (used to zero-extend bytes to words)
;   Out:   mm5 += four word lanes of differences
;          mm7 += two dword lanes of squared differences
;   Clobb: mm0, mm1, mm2, mm3
;-----------------------------------------------------------------------
%macro VAR_ROW8 0
        movq        mm0,            [rax]           ; 8 source bytes
        movq        mm2,            mm0             ; copy for the high half
        movq        mm3,            mm1             ; copy for the high half

        punpcklbw   mm0,            mm6             ; low 4 bytes -> words
        punpcklbw   mm1,            mm6
        punpckhbw   mm2,            mm6             ; high 4 bytes -> words
        punpckhbw   mm3,            mm6
        psubsw      mm0,            mm1             ; src - ref (low half)
        psubsw      mm2,            mm3             ; src - ref (high half)

        paddw       mm5,            mm0             ; accumulate differences
        paddw       mm5,            mm2

        pmaddwd     mm0,            mm0             ; square, add adjacent pairs
        pmaddwd     mm2,            mm2
        paddd       mm7,            mm0             ; accumulate squares
        paddd       mm7,            mm2
%endmacro

;-----------------------------------------------------------------------
; DIFF_ROW4 want_sum
;   Fold one 4-pixel row of (src - ref) into the running totals.
;   %1:    non-zero to also accumulate the differences in mm5
;   In:    rax -> current source row
;          mm1 =  4 reference bytes of the current row (already loaded)
;          mm6 =  0
;   Out:   mm7 += two dword lanes of squared differences
;          mm5 += four word lanes of differences (only when %1 != 0)
;   Clobb: mm0, mm1
;-----------------------------------------------------------------------
%macro DIFF_ROW4 1
        movd        mm0,            [rax]           ; 4 source bytes
        punpcklbw   mm0,            mm6             ; bytes -> words
        punpcklbw   mm1,            mm6
        psubsw      mm0,            mm1             ; src - ref
%if %1
        paddw       mm5,            mm0             ; accumulate differences
%endif
        pmaddwd     mm0,            mm0             ; square, add adjacent pairs
        paddd       mm7,            mm0             ; accumulate squares
%endmacro

;-----------------------------------------------------------------------
; VAR_STORE_RESULTS
;   Reduce the lane accumulators and hand the results back to the caller.
;   In:    mm5 = four signed word lanes of differences
;          mm7 = two dword lanes of squared differences
;          [rsp .. rsp+15] = scratch reserved by the prologue
;          arg(4) = SSE pointer, arg(5) = Sum pointer
;   Out:   *SSE = mm7 lane 0 + lane 1
;          *Sum = sum of the four sign-extended mm5 lanes
;          rax  = 0
;   Clobb: rbx, rcx, rdx, rsi, rdi, flags
;-----------------------------------------------------------------------
%macro VAR_STORE_RESULTS 0
        movq        QWORD PTR [rsp+8], mm5          ; spill difference lanes
        movq        QWORD PTR [rsp], mm7            ; spill squared lanes

        movsx       rdx,            WORD PTR [rsp+8]
        movsx       rcx,            WORD PTR [rsp+10]
        movsx       rbx,            WORD PTR [rsp+12]
        movsx       rax,            WORD PTR [rsp+14]
        add         rdx,            rcx
        add         rbx,            rax
        add         rdx,            rbx             ;XSum

        movsxd      rax,            DWORD PTR [rsp]
        movsxd      rcx,            DWORD PTR [rsp+4]
        add         rax,            rcx             ;XXSum

        mov         rsi,            arg(4)          ;SSE
        mov         rdi,            arg(5)          ;Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax,            rax             ; return 0
%endmacro


;unsigned int vp9_get_mb_ss_mmx( short *src_ptr )
;
; Returns the sum of the squares of the 256 signed 16-bit values starting at
; src_ptr (low 32 bits of the total).
global sym(vp9_get_mb_ss_mmx) PRIVATE
sym(vp9_get_mb_ss_mmx):
    push        rbp
    mov         rbp,        rsp
    ; NOTE(review): the count of 7 exceeds the single declared parameter;
    ; left as found because the macro is defined outside this file.
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp,        8                       ; 8 bytes of scratch for mm4
    ; end prolog

        mov         rax,            arg(0)          ;src_ptr
        mov         rcx,            MB_SS_ITERATIONS
        pxor        mm4,            mm4             ; mm4 = two dword partial sums

.NEXTROW:
        movq        mm0,            [rax]
        movq        mm1,            [rax+8]
        movq        mm2,            [rax+16]
        movq        mm3,            [rax+24]
        pmaddwd     mm0,            mm0             ; square, add adjacent pairs
        pmaddwd     mm1,            mm1
        pmaddwd     mm2,            mm2
        pmaddwd     mm3,            mm3

        paddd       mm4,            mm0
        paddd       mm4,            mm1
        paddd       mm4,            mm2
        paddd       mm4,            mm3

        add         rax,            MB_SS_BYTES_PER_ITER
        dec         rcx
        ; dec does not write CF, so the loop test must look at ZF only.
        ; (ja also requires CF == 0 and thus depended on the preceding add.)
        jnz         .NEXTROW
        movq        QWORD PTR [rsp], mm4

        ;return sum[0]+sum[1];
        movsxd      rax,            dword ptr [rsp]
        movsxd      rcx,            dword ptr [rsp+4]
        add         rax,            rcx


    ; begin epilog
    add         rsp,        8
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp9_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Walks 8 rows of 8 bytes from src_ptr and ref_ptr (advancing by the
; respective stride per row).  Stores the sum of squared differences to *SSE
; and the sum of differences (src - ref) to *Sum.  Always returns 0.
global sym(vp9_get8x8var_mmx) PRIVATE
sym(vp9_get8x8var_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp,        16                      ; scratch for mm7 / mm5
    ; end prolog


        pxor        mm5,            mm5             ; mm5 = 0: difference lanes
        pxor        mm6,            mm6             ; mm6 = 0: unpack helper
        pxor        mm7,            mm7             ; mm7 = 0: squared lanes

        mov         rax,            arg(0)          ;[src_ptr]  ; Load base addresses
        mov         rbx,            arg(2)          ;[ref_ptr]
        movsxd      rcx,            dword ptr arg(1) ;[source_stride]
        movsxd      rdx,            dword ptr arg(3) ;[recon_stride]

        movq        mm1,            [rbx]           ; reference row 1

        ; Rows 1-7: process the row, step both pointers, then preload the
        ; next reference row into mm1.
%rep 7
        VAR_ROW8
        add         rbx,            rdx             ; next reference row
        add         rax,            rcx             ; next source row
        movq        mm1,            [rbx]
%endrep

        ; Row 8: last row, so the pointers are not advanced any further.
        VAR_ROW8

        ; Now accumulate the final results.
        VAR_STORE_RESULTS


    ; begin epilog
    add         rsp,        16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret



;unsigned int
;vp9_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; Walks 4 rows of 4 bytes from src_ptr and ref_ptr (advancing by the
; respective stride per row).  Stores the sum of squared differences to *SSE
; and the sum of differences (src - ref) to *Sum.  Always returns 0.
global sym(vp9_get4x4var_mmx) PRIVATE
sym(vp9_get4x4var_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp,        16                      ; scratch for mm7 / mm5
    ; end prolog


        pxor        mm5,            mm5             ; mm5 = 0: difference lanes
        pxor        mm6,            mm6             ; mm6 = 0: unpack helper
        pxor        mm7,            mm7             ; mm7 = 0: squared lanes

        mov         rax,            arg(0)          ;[src_ptr]  ; Load base addresses
        mov         rbx,            arg(2)          ;[ref_ptr]
        movsxd      rcx,            dword ptr arg(1) ;[source_stride]
        movsxd      rdx,            dword ptr arg(3) ;[recon_stride]

        movd        mm1,            [rbx]           ; reference row 1

        ; Rows 1-3: process the row, step both pointers, then preload the
        ; next reference row into mm1.
%rep 3
        DIFF_ROW4   1
        add         rbx,            rdx             ; next reference row
        add         rax,            rcx             ; next source row
        movd        mm1,            [rbx]
%endrep

        ; Row 4: last row.
        DIFF_ROW4   1

        ; Now accumulate the final results.
        VAR_STORE_RESULTS


    ; begin epilog
    add         rsp,        16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret



;unsigned int
;vp9_get4x4sse_cs_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride
;)
;
; Walks 4 rows of 4 bytes from src_ptr and ref_ptr (advancing by the
; respective stride per row) and returns the sum of squared differences.
global sym(vp9_get4x4sse_cs_mmx) PRIVATE
sym(vp9_get4x4sse_cs_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    push        rbx
    ; end prolog


        pxor        mm6,            mm6             ; mm6 = 0: unpack helper
        pxor        mm7,            mm7             ; mm7 = 0: squared lanes

        mov         rax,            arg(0)          ;[src_ptr]  ; Load base addresses
        mov         rbx,            arg(2)          ;[ref_ptr]
        movsxd      rcx,            dword ptr arg(1) ;[source_stride]
        movsxd      rdx,            dword ptr arg(3) ;[recon_stride]

        movd        mm1,            [rbx]           ; reference row 1

        ; Rows 1-3: process the row, step both pointers, then preload the
        ; next reference row into mm1.
%rep 3
        DIFF_ROW4   0
        add         rbx,            rdx             ; next reference row
        add         rax,            rcx             ; next source row
        movd        mm1,            [rbx]
%endrep

        ; Row 4: last row.
        DIFF_ROW4   0

        ; Horizontal add of the two dword lanes of mm7.
        movq        mm0,            mm7
        psrlq       mm7,            32

        paddd       mm0,            mm7
        ; The result is the low dword only (the C prototype returns
        ; unsigned int); the high dword of mm0 still holds the old upper
        ; lane, so copy just 32 bits into the return register.
        movd        eax,            mm0


    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret