michael@0: ; michael@0: ; Copyright (c) 2010 The WebM project authors. All Rights Reserved. michael@0: ; michael@0: ; Use of this source code is governed by a BSD-style license michael@0: ; that can be found in the LICENSE file in the root of the source michael@0: ; tree. An additional intellectual property rights grant can be found michael@0: ; in the file PATENTS. All contributing project authors may michael@0: ; be found in the AUTHORS file in the root of the source tree. michael@0: ; michael@0: michael@0: michael@0: %include "vpx_ports/x86_abi_support.asm" michael@0: michael@0: ;unsigned int vp8_get_mb_ss_mmx( short *src_ptr ) michael@0: global sym(vp8_get_mb_ss_mmx) PRIVATE michael@0: sym(vp8_get_mb_ss_mmx): michael@0: push rbp michael@0: mov rbp, rsp michael@0: SHADOW_ARGS_TO_STACK 7 michael@0: GET_GOT rbx michael@0: push rsi michael@0: push rdi michael@0: sub rsp, 8 michael@0: ; end prolog michael@0: michael@0: mov rax, arg(0) ;src_ptr michael@0: mov rcx, 16 michael@0: pxor mm4, mm4 michael@0: michael@0: .NEXTROW: michael@0: movq mm0, [rax] michael@0: movq mm1, [rax+8] michael@0: movq mm2, [rax+16] michael@0: movq mm3, [rax+24] michael@0: pmaddwd mm0, mm0 michael@0: pmaddwd mm1, mm1 michael@0: pmaddwd mm2, mm2 michael@0: pmaddwd mm3, mm3 michael@0: michael@0: paddd mm4, mm0 michael@0: paddd mm4, mm1 michael@0: paddd mm4, mm2 michael@0: paddd mm4, mm3 michael@0: michael@0: add rax, 32 michael@0: dec rcx michael@0: ja .NEXTROW michael@0: movq QWORD PTR [rsp], mm4 michael@0: michael@0: ;return sum[0]+sum[1]; michael@0: movsxd rax, dword ptr [rsp] michael@0: movsxd rcx, dword ptr [rsp+4] michael@0: add rax, rcx michael@0: michael@0: michael@0: ; begin epilog michael@0: add rsp, 8 michael@0: pop rdi michael@0: pop rsi michael@0: RESTORE_GOT michael@0: UNSHADOW_ARGS michael@0: pop rbp michael@0: ret michael@0: michael@0: michael@0: ;unsigned int vp8_get8x8var_mmx michael@0: ;( michael@0: ; unsigned char *src_ptr, michael@0: ; int source_stride, michael@0: ; unsigned char *ref_ptr, michael@0: ; int recon_stride, michael@0: ; unsigned int *SSE, michael@0: ; int *Sum michael@0: ;) michael@0: global sym(vp8_get8x8var_mmx) PRIVATE michael@0: sym(vp8_get8x8var_mmx): michael@0: push rbp michael@0: mov rbp, rsp michael@0: SHADOW_ARGS_TO_STACK 6 michael@0: push rsi michael@0: push rdi michael@0: push rbx michael@0: sub rsp, 16 michael@0: ; end prolog michael@0: michael@0: michael@0: pxor mm5, mm5 ; Blank mmx6 michael@0: pxor mm6, mm6 ; Blank mmx7 michael@0: pxor mm7, mm7 ; Blank mmx7 michael@0: michael@0: mov rax, arg(0) ;[src_ptr] ; Load base addresses michael@0: mov rbx, arg(2) ;[ref_ptr] michael@0: movsxd rcx, dword ptr arg(1) ;[source_stride] michael@0: movsxd rdx, dword ptr arg(3) ;[recon_stride] michael@0: michael@0: ; Row 1 michael@0: movq mm0, [rax] ; Copy eight bytes to mm0 michael@0: movq mm1, [rbx] ; Copy eight bytes to mm1 michael@0: movq mm2, mm0 ; Take copies michael@0: movq mm3, mm1 ; Take copies michael@0: michael@0: punpcklbw mm0, mm6 ; unpack to higher prrcision michael@0: punpcklbw mm1, mm6 michael@0: punpckhbw mm2, mm6 ; unpack to higher prrcision michael@0: punpckhbw mm3, mm6 michael@0: psubsw mm0, mm1 ; A-B (low order) to MM0 michael@0: psubsw mm2, mm3 ; A-B (high order) to MM2 michael@0: michael@0: paddw mm5, mm0 ; accumulate differences in mm5 michael@0: paddw mm5, mm2 ; accumulate differences in mm5 michael@0: michael@0: pmaddwd mm0, mm0 ; square and accumulate michael@0: pmaddwd mm2, mm2 ; square and accumulate michael@0: add rbx,rdx ; Inc pointer into ref data michael@0: add rax,rcx ; Inc pointer into the new data michael@0: movq mm1, [rbx] ; Copy eight bytes to mm1 michael@0: paddd mm7, mm0 ; accumulate in mm7 michael@0: paddd mm7, mm2 ; accumulate in mm7 michael@0: michael@0: michael@0: ; Row 2 michael@0: movq mm0, [rax] ; Copy eight bytes to mm0 michael@0: movq mm2, mm0 ; Take copies michael@0: movq mm3, mm1 ; Take copies michael@0: michael@0: punpcklbw mm0, mm6 ; unpack to higher prrcision michael@0: punpcklbw mm1, mm6 michael@0: punpckhbw mm2, mm6 ; unpack to higher prrcision michael@0: punpckhbw mm3, mm6 michael@0: psubsw mm0, mm1 ; A-B (low order) to MM0 michael@0: psubsw mm2, mm3 ; A-B (high order) to MM2 michael@0: michael@0: paddw mm5, mm0 ; accumulate differences in mm5 michael@0: paddw mm5, mm2 ; accumulate differences in mm5 michael@0: michael@0: pmaddwd mm0, mm0 ; square and accumulate michael@0: pmaddwd mm2, mm2 ; square and accumulate michael@0: add rbx,rdx ; Inc pointer into ref data michael@0: add rax,rcx ; Inc pointer into the new data michael@0: movq mm1, [rbx] ; Copy eight bytes to mm1 michael@0: paddd mm7, mm0 ; accumulate in mm7 michael@0: paddd mm7, mm2 ; accumulate in mm7 michael@0: michael@0: ; Row 3 michael@0: movq mm0, [rax] ; Copy eight bytes to mm0 michael@0: movq mm2, mm0 ; Take copies michael@0: movq mm3, mm1 ; Take copies michael@0: michael@0: punpcklbw mm0, mm6 ; unpack to higher prrcision michael@0: punpcklbw mm1, mm6 michael@0: punpckhbw mm2, mm6 ; unpack to higher prrcision michael@0: punpckhbw mm3, mm6 michael@0: psubsw mm0, mm1 ; A-B (low order) to MM0 michael@0: psubsw mm2, mm3 ; A-B (high order) to MM2 michael@0: michael@0: paddw mm5, mm0 ; accumulate differences in mm5 michael@0: paddw mm5, mm2 ; accumulate differences in mm5 michael@0: michael@0: pmaddwd mm0, mm0 ; square and accumulate michael@0: pmaddwd mm2, mm2 ; square and accumulate michael@0: add rbx,rdx ; Inc pointer into ref data michael@0: add rax,rcx ; Inc pointer into the new data michael@0: movq mm1, [rbx] ; Copy eight bytes to mm1 michael@0: paddd mm7, mm0 ; accumulate in mm7 michael@0: paddd mm7, mm2 ; accumulate in mm7 michael@0: michael@0: ; Row 4 michael@0: movq mm0, [rax] ; Copy eight bytes to mm0 michael@0: movq mm2, mm0 ; Take copies michael@0: movq mm3, mm1 ; Take copies michael@0: michael@0: punpcklbw mm0, mm6 ; unpack to higher prrcision michael@0: punpcklbw mm1, mm6 michael@0: punpckhbw mm2, mm6 ; unpack to higher prrcision michael@0: punpckhbw mm3, mm6 michael@0: psubsw mm0, mm1 ; A-B (low order) to MM0 michael@0: psubsw mm2, mm3 ; A-B (high order) to MM2 michael@0: michael@0: paddw mm5, mm0 ; accumulate differences in mm5 michael@0: paddw mm5, mm2 ; accumulate differences in mm5 michael@0: michael@0: pmaddwd mm0, mm0 ; square and accumulate michael@0: pmaddwd mm2, mm2 ; square and accumulate michael@0: add rbx,rdx ; Inc pointer into ref data michael@0: add rax,rcx ; Inc pointer into the new data michael@0: movq mm1, [rbx] ; Copy eight bytes to mm1 michael@0: paddd mm7, mm0 ; accumulate in mm7 michael@0: paddd mm7, mm2 ; accumulate in mm7 michael@0: michael@0: ; Row 5 michael@0: movq mm0, [rax] ; Copy eight bytes to mm0 michael@0: movq mm2, mm0 ; Take copies michael@0: movq mm3, mm1 ; Take copies michael@0: michael@0: punpcklbw mm0, mm6 ; unpack to higher prrcision michael@0: punpcklbw mm1, mm6 michael@0: punpckhbw mm2, mm6 ; unpack to higher prrcision michael@0: punpckhbw mm3, mm6 michael@0: psubsw mm0, mm1 ; A-B (low order) to MM0 michael@0: psubsw mm2, mm3 ; A-B (high order) to MM2 michael@0: michael@0: paddw mm5, mm0 ; accumulate differences in mm5 michael@0: paddw mm5, mm2 ; accumulate differences in mm5 michael@0: michael@0: pmaddwd mm0, mm0 ; square and accumulate michael@0: pmaddwd mm2, mm2 ; square and accumulate michael@0: add rbx,rdx ; Inc pointer into ref data michael@0: add rax,rcx ; Inc pointer into the new data michael@0: movq mm1, [rbx] ; Copy eight bytes to mm1 michael@0: ; movq mm4, [rbx + rdx] michael@0: paddd mm7, mm0 ; accumulate in mm7 michael@0: paddd mm7, mm2 ; accumulate in mm7 michael@0: michael@0: ; Row 6 michael@0: movq mm0, [rax] ; Copy eight bytes to mm0 michael@0: movq mm2, mm0 ; Take copies michael@0: movq mm3, mm1 ; Take copies michael@0: michael@0: punpcklbw mm0, mm6 ; unpack to higher prrcision michael@0: punpcklbw mm1, mm6 michael@0: punpckhbw mm2, mm6 ; unpack to higher prrcision michael@0: punpckhbw mm3, mm6 michael@0: psubsw mm0, mm1 ; A-B (low order) to MM0 michael@0: psubsw mm2, mm3 ; A-B (high order) to MM2 michael@0: michael@0: paddw mm5, mm0 ; accumulate differences in mm5 michael@0: paddw mm5, mm2 ; accumulate differences in mm5 michael@0: michael@0: pmaddwd mm0, mm0 ; square and accumulate michael@0: pmaddwd mm2, mm2 ; square and accumulate michael@0: add rbx,rdx ; Inc pointer into ref data michael@0: add rax,rcx ; Inc pointer into the new data michael@0: movq mm1, [rbx] ; Copy eight bytes to mm1 michael@0: paddd mm7, mm0 ; accumulate in mm7 michael@0: paddd mm7, mm2 ; accumulate in mm7 michael@0: michael@0: ; Row 7 michael@0: movq mm0, [rax] ; Copy eight bytes to mm0 michael@0: movq mm2, mm0 ; Take copies michael@0: movq mm3, mm1 ; Take copies michael@0: michael@0: punpcklbw mm0, mm6 ; unpack to higher prrcision michael@0: punpcklbw mm1, mm6 michael@0: punpckhbw mm2, mm6 ; unpack to higher prrcision michael@0: punpckhbw mm3, mm6 michael@0: psubsw mm0, mm1 ; A-B (low order) to MM0 michael@0: psubsw mm2, mm3 ; A-B (high order) to MM2 michael@0: michael@0: paddw mm5, mm0 ; accumulate differences in mm5 michael@0: paddw mm5, mm2 ; accumulate differences in mm5 michael@0: michael@0: pmaddwd mm0, mm0 ; square and accumulate michael@0: pmaddwd mm2, mm2 ; square and accumulate michael@0: add rbx,rdx ; Inc pointer into ref data michael@0: add rax,rcx ; Inc pointer into the new data michael@0: movq mm1, [rbx] ; Copy eight bytes to mm1 michael@0: paddd mm7, mm0 ; accumulate in mm7 michael@0: paddd mm7, mm2 ; accumulate in mm7 michael@0: michael@0: ; Row 8 michael@0: movq mm0, [rax] ; Copy eight bytes to mm0 michael@0: movq mm2, mm0 ; Take copies michael@0: movq mm3, mm1 ; Take copies michael@0: michael@0: punpcklbw mm0, mm6 ; unpack to higher prrcision michael@0: punpcklbw mm1, mm6 michael@0: punpckhbw mm2, mm6 ; unpack to higher prrcision michael@0: punpckhbw mm3, mm6 michael@0: psubsw mm0, mm1 ; A-B (low order) to MM0 michael@0: psubsw mm2, mm3 ; A-B (high order) to MM2 michael@0: michael@0: paddw mm5, mm0 ; accumulate differences in mm5 michael@0: paddw mm5, mm2 ; accumulate differences in mm5 michael@0: michael@0: pmaddwd mm0, mm0 ; square and accumulate michael@0: pmaddwd mm2, mm2 ; square and accumulate michael@0: add rbx,rdx ; Inc pointer into ref data michael@0: add rax,rcx ; Inc pointer into the new data michael@0: paddd mm7, mm0 ; accumulate in mm7 michael@0: paddd mm7, mm2 ; accumulate in mm7 michael@0: michael@0: ; Now accumulate the final results. michael@0: movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory michael@0: movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory michael@0: movsx rdx, WORD PTR [rsp+8] michael@0: movsx rcx, WORD PTR [rsp+10] michael@0: movsx rbx, WORD PTR [rsp+12] michael@0: movsx rax, WORD PTR [rsp+14] michael@0: add rdx, rcx michael@0: add rbx, rax michael@0: add rdx, rbx ;XSum michael@0: movsxd rax, DWORD PTR [rsp] michael@0: movsxd rcx, DWORD PTR [rsp+4] michael@0: add rax, rcx ;XXSum michael@0: mov rsi, arg(4) ;SSE michael@0: mov rdi, arg(5) ;Sum michael@0: mov dword ptr [rsi], eax michael@0: mov dword ptr [rdi], edx michael@0: xor rax, rax ; return 0 michael@0: michael@0: michael@0: ; begin epilog michael@0: add rsp, 16 michael@0: pop rbx michael@0: pop rdi michael@0: pop rsi michael@0: UNSHADOW_ARGS michael@0: pop rbp michael@0: ret michael@0: michael@0: michael@0: michael@0: ;unsigned int michael@0: ;vp8_get4x4var_mmx michael@0: ;( michael@0: ; unsigned char *src_ptr, michael@0: ; int source_stride, michael@0: ; unsigned char *ref_ptr, michael@0: ; int recon_stride, michael@0: ; unsigned int *SSE, michael@0: ; int *Sum michael@0: ;) michael@0: global sym(vp8_get4x4var_mmx) PRIVATE michael@0: sym(vp8_get4x4var_mmx): michael@0: push rbp michael@0: mov rbp, rsp michael@0: SHADOW_ARGS_TO_STACK 6 michael@0: push rsi michael@0: push rdi michael@0: push rbx michael@0: sub rsp, 16 michael@0: ; end prolog michael@0: michael@0: michael@0: pxor mm5, mm5 ; Blank mmx6 michael@0: pxor mm6, mm6 ; Blank mmx7 michael@0: pxor mm7, mm7 ; Blank mmx7 michael@0: michael@0: mov rax, arg(0) ;[src_ptr] ; Load base addresses michael@0: mov rbx, arg(2) ;[ref_ptr] michael@0: movsxd rcx, dword ptr arg(1) ;[source_stride] michael@0: movsxd rdx, dword ptr arg(3) ;[recon_stride] michael@0: michael@0: ; Row 1 michael@0: movq mm0, [rax] ; Copy eight bytes to mm0 michael@0: movq mm1, [rbx] ; Copy eight bytes to mm1 michael@0: punpcklbw mm0, mm6 ; unpack to higher prrcision michael@0: punpcklbw mm1, mm6 michael@0: psubsw mm0, mm1 ; A-B (low order) to MM0 michael@0: paddw mm5, mm0 ; accumulate differences in mm5 michael@0: pmaddwd mm0, mm0 ; square and accumulate michael@0: add rbx,rdx ; Inc pointer into ref data michael@0: add rax,rcx ; Inc pointer into the new data michael@0: movq mm1, [rbx] ; Copy eight bytes to mm1 michael@0: paddd mm7, mm0 ; accumulate in mm7 michael@0: michael@0: michael@0: ; Row 2 michael@0: movq mm0, [rax] ; Copy eight bytes to mm0 michael@0: punpcklbw mm0, mm6 ; unpack to higher prrcision michael@0: punpcklbw mm1, mm6 michael@0: psubsw mm0, mm1 ; A-B (low order) to MM0 michael@0: paddw mm5, mm0 ; accumulate differences in mm5 michael@0: michael@0: pmaddwd mm0, mm0 ; square and accumulate michael@0: add rbx,rdx ; Inc pointer into ref data michael@0: add rax,rcx ; Inc pointer into the new data michael@0: movq mm1, [rbx] ; Copy eight bytes to mm1 michael@0: paddd mm7, mm0 ; accumulate in mm7 michael@0: michael@0: ; Row 3 michael@0: movq mm0, [rax] ; Copy eight bytes to mm0 michael@0: punpcklbw mm0, mm6 ; unpack to higher prrcision michael@0: punpcklbw mm1, mm6 michael@0: psubsw mm0, mm1 ; A-B (low order) to MM0 michael@0: paddw mm5, mm0 ; accumulate differences in mm5 michael@0: michael@0: pmaddwd mm0, mm0 ; square and accumulate michael@0: add rbx,rdx ; Inc pointer into ref data michael@0: add rax,rcx ; Inc pointer into the new data michael@0: movq mm1, [rbx] ; Copy eight bytes to mm1 michael@0: paddd mm7, mm0 ; accumulate in mm7 michael@0: michael@0: ; Row 4 michael@0: movq mm0, [rax] ; Copy eight bytes to mm0 michael@0: michael@0: punpcklbw mm0, mm6 ; unpack to higher prrcision michael@0: punpcklbw mm1, mm6 michael@0: psubsw mm0, mm1 ; A-B (low order) to MM0 michael@0: michael@0: paddw mm5, mm0 ; accumulate differences in mm5 michael@0: michael@0: pmaddwd mm0, mm0 ; square and accumulate michael@0: paddd mm7, mm0 ; accumulate in mm7 michael@0: michael@0: michael@0: ; Now accumulate the final results. michael@0: movq QWORD PTR [rsp+8], mm5 ; copy back accumulated results into normal memory michael@0: movq QWORD PTR [rsp], mm7 ; copy back accumulated results into normal memory michael@0: movsx rdx, WORD PTR [rsp+8] michael@0: movsx rcx, WORD PTR [rsp+10] michael@0: movsx rbx, WORD PTR [rsp+12] michael@0: movsx rax, WORD PTR [rsp+14] michael@0: add rdx, rcx michael@0: add rbx, rax michael@0: add rdx, rbx ;XSum michael@0: movsxd rax, DWORD PTR [rsp] michael@0: movsxd rcx, DWORD PTR [rsp+4] michael@0: add rax, rcx ;XXSum michael@0: mov rsi, arg(4) ;SSE michael@0: mov rdi, arg(5) ;Sum michael@0: mov dword ptr [rsi], eax michael@0: mov dword ptr [rdi], edx michael@0: xor rax, rax ; return 0 michael@0: michael@0: michael@0: ; begin epilog michael@0: add rsp, 16 michael@0: pop rbx michael@0: pop rdi michael@0: pop rsi michael@0: UNSHADOW_ARGS michael@0: pop rbp michael@0: ret michael@0: michael@0: michael@0: michael@0: ;unsigned int michael@0: ;vp8_get4x4sse_cs_mmx michael@0: ;( michael@0: ; unsigned char *src_ptr, michael@0: ; int source_stride, michael@0: ; unsigned char *ref_ptr, michael@0: ; int recon_stride michael@0: ;) michael@0: global sym(vp8_get4x4sse_cs_mmx) PRIVATE michael@0: sym(vp8_get4x4sse_cs_mmx): michael@0: push rbp michael@0: mov rbp, rsp michael@0: SHADOW_ARGS_TO_STACK 4 michael@0: push rsi michael@0: push rdi michael@0: push rbx michael@0: ; end prolog michael@0: michael@0: michael@0: pxor mm6, mm6 ; Blank mmx7 michael@0: pxor mm7, mm7 ; Blank mmx7 michael@0: michael@0: mov rax, arg(0) ;[src_ptr] ; Load base addresses michael@0: mov rbx, arg(2) ;[ref_ptr] michael@0: movsxd rcx, dword ptr arg(1) ;[source_stride] michael@0: movsxd rdx, dword ptr arg(3) ;[recon_stride] michael@0: ; Row 1 michael@0: movd mm0, [rax] ; Copy eight bytes to mm0 michael@0: movd mm1, [rbx] ; Copy eight bytes to mm1 michael@0: punpcklbw mm0, mm6 ; unpack to higher prrcision michael@0: punpcklbw mm1, mm6 michael@0: psubsw mm0, mm1 ; A-B (low order) to MM0 michael@0: pmaddwd mm0, mm0 ; square and accumulate michael@0: add rbx,rdx ; Inc pointer into ref data michael@0: add rax,rcx ; Inc pointer into the new data michael@0: movd mm1, [rbx] ; Copy eight bytes to mm1 michael@0: paddd mm7, mm0 ; accumulate in mm7 michael@0: michael@0: ; Row 2 michael@0: movd mm0, [rax] ; Copy eight bytes to mm0 michael@0: punpcklbw mm0, mm6 ; unpack to higher prrcision michael@0: punpcklbw mm1, mm6 michael@0: psubsw mm0, mm1 ; A-B (low order) to MM0 michael@0: pmaddwd mm0, mm0 ; square and accumulate michael@0: add rbx,rdx ; Inc pointer into ref data michael@0: add rax,rcx ; Inc pointer into the new data michael@0: movd mm1, [rbx] ; Copy eight bytes to mm1 michael@0: paddd mm7, mm0 ; accumulate in mm7 michael@0: michael@0: ; Row 3 michael@0: movd mm0, [rax] ; Copy eight bytes to mm0 michael@0: punpcklbw mm1, mm6 michael@0: punpcklbw mm0, mm6 ; unpack to higher prrcision michael@0: psubsw mm0, mm1 ; A-B (low order) to MM0 michael@0: michael@0: pmaddwd mm0, mm0 ; square and accumulate michael@0: add rbx,rdx ; Inc pointer into ref data michael@0: add rax,rcx ; Inc pointer into the new data michael@0: movd mm1, [rbx] ; Copy eight bytes to mm1 michael@0: paddd mm7, mm0 ; accumulate in mm7 michael@0: michael@0: ; Row 4 michael@0: movd mm0, [rax] ; Copy eight bytes to mm0 michael@0: punpcklbw mm0, mm6 ; unpack to higher prrcision michael@0: punpcklbw mm1, mm6 michael@0: psubsw mm0, mm1 ; A-B (low order) to MM0 michael@0: pmaddwd mm0, mm0 ; square and accumulate michael@0: paddd mm7, mm0 ; accumulate in mm7 michael@0: michael@0: movq mm0, mm7 ; michael@0: psrlq mm7, 32 michael@0: michael@0: paddd mm0, mm7 michael@0: movq rax, mm0 michael@0: michael@0: michael@0: ; begin epilog michael@0: pop rbx michael@0: pop rdi michael@0: pop rsi michael@0: UNSHADOW_ARGS michael@0: pop rbp michael@0: ret michael@0: michael@0: %define mmx_filter_shift 7 michael@0: michael@0: ;void vp8_filter_block2d_bil4x4_var_mmx michael@0: ;( michael@0: ; unsigned char *ref_ptr, michael@0: ; int ref_pixels_per_line, michael@0: ; unsigned char *src_ptr, michael@0: ; int src_pixels_per_line, michael@0: ; unsigned short *HFilter, michael@0: ; unsigned short *VFilter, michael@0: ; int *sum, michael@0: ; unsigned int *sumsquared michael@0: ;) michael@0: global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE michael@0: sym(vp8_filter_block2d_bil4x4_var_mmx): michael@0: push rbp michael@0: mov rbp, rsp michael@0: SHADOW_ARGS_TO_STACK 8 michael@0: GET_GOT rbx michael@0: push rsi michael@0: push rdi michael@0: sub rsp, 16 michael@0: ; end prolog michael@0: michael@0: michael@0: pxor mm6, mm6 ; michael@0: pxor mm7, mm7 ; michael@0: michael@0: mov rax, arg(4) ;HFilter ; michael@0: mov rdx, arg(5) ;VFilter ; michael@0: michael@0: mov rsi, arg(0) ;ref_ptr ; michael@0: mov rdi, arg(2) ;src_ptr ; michael@0: michael@0: mov rcx, 4 ; michael@0: pxor mm0, mm0 ; michael@0: michael@0: movd mm1, [rsi] ; michael@0: movd mm3, [rsi+1] ; michael@0: michael@0: punpcklbw mm1, mm0 ; michael@0: pmullw mm1, [rax] ; michael@0: michael@0: punpcklbw mm3, mm0 ; michael@0: pmullw mm3, [rax+8] ; michael@0: michael@0: paddw mm1, mm3 ; michael@0: paddw mm1, [GLOBAL(mmx_bi_rd)] ; michael@0: michael@0: psraw mm1, mmx_filter_shift ; michael@0: movq mm5, mm1 michael@0: michael@0: %if ABI_IS_32BIT michael@0: add rsi, dword ptr arg(1) ;ref_pixels_per_line ; michael@0: %else michael@0: movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; michael@0: add rsi, r8 michael@0: %endif michael@0: michael@0: .filter_block2d_bil4x4_var_mmx_loop: michael@0: michael@0: movd mm1, [rsi] ; michael@0: movd mm3, [rsi+1] ; michael@0: michael@0: punpcklbw mm1, mm0 ; michael@0: pmullw mm1, [rax] ; michael@0: michael@0: punpcklbw mm3, mm0 ; michael@0: pmullw mm3, [rax+8] ; michael@0: michael@0: paddw mm1, mm3 ; michael@0: paddw mm1, [GLOBAL(mmx_bi_rd)] ; michael@0: michael@0: psraw mm1, mmx_filter_shift ; michael@0: movq mm3, mm5 ; michael@0: michael@0: movq mm5, mm1 ; michael@0: pmullw mm3, [rdx] ; michael@0: michael@0: pmullw mm1, [rdx+8] ; michael@0: paddw mm1, mm3 ; michael@0: michael@0: michael@0: paddw mm1, [GLOBAL(mmx_bi_rd)] ; michael@0: psraw mm1, mmx_filter_shift ; michael@0: michael@0: movd mm3, [rdi] ; michael@0: punpcklbw mm3, mm0 ; michael@0: michael@0: psubw mm1, mm3 ; michael@0: paddw mm6, mm1 ; michael@0: michael@0: pmaddwd mm1, mm1 ; michael@0: paddd mm7, mm1 ; michael@0: michael@0: %if ABI_IS_32BIT michael@0: add rsi, dword ptr arg(1) ;ref_pixels_per_line ; michael@0: add rdi, dword ptr arg(3) ;src_pixels_per_line ; michael@0: %else michael@0: movsxd r8, dword ptr arg(1) ;ref_pixels_per_line michael@0: movsxd r9, dword ptr arg(3) ;src_pixels_per_line michael@0: add rsi, r8 michael@0: add rdi, r9 michael@0: %endif michael@0: sub rcx, 1 ; michael@0: jnz .filter_block2d_bil4x4_var_mmx_loop ; michael@0: michael@0: michael@0: pxor mm3, mm3 ; michael@0: pxor mm2, mm2 ; michael@0: michael@0: punpcklwd mm2, mm6 ; michael@0: punpckhwd mm3, mm6 ; michael@0: michael@0: paddd mm2, mm3 ; michael@0: movq mm6, mm2 ; michael@0: michael@0: psrlq mm6, 32 ; michael@0: paddd mm2, mm6 ; michael@0: michael@0: psrad mm2, 16 ; michael@0: movq mm4, mm7 ; michael@0: michael@0: psrlq mm4, 32 ; michael@0: paddd mm4, mm7 ; michael@0: michael@0: mov rdi, arg(6) ;sum michael@0: mov rsi, arg(7) ;sumsquared michael@0: michael@0: movd dword ptr [rdi], mm2 ; michael@0: movd dword ptr [rsi], mm4 ; michael@0: michael@0: michael@0: michael@0: ; begin epilog michael@0: add rsp, 16 michael@0: pop rdi michael@0: pop rsi michael@0: RESTORE_GOT michael@0: UNSHADOW_ARGS michael@0: pop rbp michael@0: ret michael@0: michael@0: michael@0: michael@0: michael@0: ;void vp8_filter_block2d_bil_var_mmx michael@0: ;( michael@0: ; unsigned char *ref_ptr, michael@0: ; int ref_pixels_per_line, michael@0: ; unsigned char *src_ptr, michael@0: ; int src_pixels_per_line, michael@0: ; unsigned int Height, michael@0: ; unsigned short *HFilter, michael@0: ; unsigned short *VFilter, michael@0: ; int *sum, michael@0: ; unsigned int *sumsquared michael@0: ;) michael@0: global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE michael@0: sym(vp8_filter_block2d_bil_var_mmx): michael@0: push rbp michael@0: mov rbp, rsp michael@0: SHADOW_ARGS_TO_STACK 9 michael@0: GET_GOT rbx michael@0: push rsi michael@0: push rdi michael@0: sub rsp, 16 michael@0: ; end prolog michael@0: michael@0: pxor mm6, mm6 ; michael@0: pxor mm7, mm7 ; michael@0: mov rax, arg(5) ;HFilter ; michael@0: michael@0: mov rdx, arg(6) ;VFilter ; michael@0: mov rsi, arg(0) ;ref_ptr ; michael@0: michael@0: mov rdi, arg(2) ;src_ptr ; michael@0: movsxd rcx, dword ptr arg(4) ;Height ; michael@0: michael@0: pxor mm0, mm0 ; michael@0: movq mm1, [rsi] ; michael@0: michael@0: movq mm3, [rsi+1] ; michael@0: movq mm2, mm1 ; michael@0: michael@0: movq mm4, mm3 ; michael@0: punpcklbw mm1, mm0 ; michael@0: michael@0: punpckhbw mm2, mm0 ; michael@0: pmullw mm1, [rax] ; michael@0: michael@0: pmullw mm2, [rax] ; michael@0: punpcklbw mm3, mm0 ; michael@0: michael@0: punpckhbw mm4, mm0 ; michael@0: pmullw mm3, [rax+8] ; michael@0: michael@0: pmullw mm4, [rax+8] ; michael@0: paddw mm1, mm3 ; michael@0: michael@0: paddw mm2, mm4 ; michael@0: paddw mm1, [GLOBAL(mmx_bi_rd)] ; michael@0: michael@0: psraw mm1, mmx_filter_shift ; michael@0: paddw mm2, [GLOBAL(mmx_bi_rd)] ; michael@0: michael@0: psraw mm2, mmx_filter_shift ; michael@0: movq mm5, mm1 michael@0: michael@0: packuswb mm5, mm2 ; michael@0: %if ABI_IS_32BIT michael@0: add rsi, dword ptr arg(1) ;ref_pixels_per_line michael@0: %else michael@0: movsxd r8, dword ptr arg(1) ;ref_pixels_per_line michael@0: add rsi, r8 michael@0: %endif michael@0: michael@0: .filter_block2d_bil_var_mmx_loop: michael@0: michael@0: movq mm1, [rsi] ; michael@0: movq mm3, [rsi+1] ; michael@0: michael@0: movq mm2, mm1 ; michael@0: movq mm4, mm3 ; michael@0: michael@0: punpcklbw mm1, mm0 ; michael@0: punpckhbw mm2, mm0 ; michael@0: michael@0: pmullw mm1, [rax] ; michael@0: pmullw mm2, [rax] ; michael@0: michael@0: punpcklbw mm3, mm0 ; michael@0: punpckhbw mm4, mm0 ; michael@0: michael@0: pmullw mm3, [rax+8] ; michael@0: pmullw mm4, [rax+8] ; michael@0: michael@0: paddw mm1, mm3 ; michael@0: paddw mm2, mm4 ; michael@0: michael@0: paddw mm1, [GLOBAL(mmx_bi_rd)] ; michael@0: psraw mm1, mmx_filter_shift ; michael@0: michael@0: paddw mm2, [GLOBAL(mmx_bi_rd)] ; michael@0: psraw mm2, mmx_filter_shift ; michael@0: michael@0: movq mm3, mm5 ; michael@0: movq mm4, mm5 ; michael@0: michael@0: punpcklbw mm3, mm0 ; michael@0: punpckhbw mm4, mm0 ; michael@0: michael@0: movq mm5, mm1 ; michael@0: packuswb mm5, mm2 ; michael@0: michael@0: pmullw mm3, [rdx] ; michael@0: pmullw mm4, [rdx] ; michael@0: michael@0: pmullw mm1, [rdx+8] ; michael@0: pmullw mm2, [rdx+8] ; michael@0: michael@0: paddw mm1, mm3 ; michael@0: paddw mm2, mm4 ; michael@0: michael@0: paddw mm1, [GLOBAL(mmx_bi_rd)] ; michael@0: paddw mm2, [GLOBAL(mmx_bi_rd)] ; michael@0: michael@0: psraw mm1, mmx_filter_shift ; michael@0: psraw mm2, mmx_filter_shift ; michael@0: michael@0: movq mm3, [rdi] ; michael@0: movq mm4, mm3 ; michael@0: michael@0: punpcklbw mm3, mm0 ; michael@0: punpckhbw mm4, mm0 ; michael@0: michael@0: psubw mm1, mm3 ; michael@0: psubw mm2, mm4 ; michael@0: michael@0: paddw mm6, mm1 ; michael@0: pmaddwd mm1, mm1 ; michael@0: michael@0: paddw mm6, mm2 ; michael@0: pmaddwd mm2, mm2 ; michael@0: michael@0: paddd mm7, mm1 ; michael@0: paddd mm7, mm2 ; michael@0: michael@0: %if ABI_IS_32BIT michael@0: add rsi, dword ptr arg(1) ;ref_pixels_per_line ; michael@0: add rdi, dword ptr arg(3) ;src_pixels_per_line ; michael@0: %else michael@0: movsxd r8, dword ptr arg(1) ;ref_pixels_per_line ; michael@0: movsxd r9, dword ptr arg(3) ;src_pixels_per_line ; michael@0: add rsi, r8 michael@0: add rdi, r9 michael@0: %endif michael@0: sub rcx, 1 ; michael@0: jnz .filter_block2d_bil_var_mmx_loop ; michael@0: michael@0: michael@0: pxor mm3, mm3 ; michael@0: pxor mm2, mm2 ; michael@0: michael@0: punpcklwd mm2, mm6 ; michael@0: punpckhwd mm3, mm6 ; michael@0: michael@0: paddd mm2, mm3 ; michael@0: movq mm6, mm2 ; michael@0: michael@0: psrlq mm6, 32 ; michael@0: paddd mm2, mm6 ; michael@0: michael@0: psrad mm2, 16 ; michael@0: movq mm4, mm7 ; michael@0: michael@0: psrlq mm4, 32 ; michael@0: paddd mm4, mm7 ; michael@0: michael@0: mov rdi, arg(7) ;sum michael@0: mov rsi, arg(8) ;sumsquared michael@0: michael@0: movd dword ptr [rdi], mm2 ; michael@0: movd dword ptr [rsi], mm4 ; michael@0: michael@0: ; begin epilog michael@0: add rsp, 16 michael@0: pop rdi michael@0: pop rsi michael@0: RESTORE_GOT michael@0: UNSHADOW_ARGS michael@0: pop rbp michael@0: ret michael@0: michael@0: michael@0: SECTION_RODATA michael@0: ;short mmx_bi_rd[4] = { 64, 64, 64, 64}; michael@0: align 16 michael@0: mmx_bi_rd: michael@0: times 4 dw 64