;
;  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS. All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;void vp8_filter_by_weight16x16_sse2
;(
;    unsigned char *src,
;    int src_stride,
;    unsigned char *dst,
;    int dst_stride,
;    int src_weight
;)
global sym(vp8_filter_by_weight16x16_sse2) PRIVATE
sym(vp8_filter_by_weight16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 16                     ; loop count
    pxor        xmm6, xmm6

.combine:
    movdqa      xmm2, [rax]
    movdqa      xmm4, [rdx]
    add         rax, rsi

    ; src * src_weight
    movdqa      xmm3, xmm2
    punpcklbw   xmm2, xmm6
    punpckhbw   xmm3, xmm6
    pmullw      xmm2, xmm0
    pmullw      xmm3, xmm0

    ; dst * dst_weight
    movdqa      xmm5, xmm4
    punpcklbw   xmm4, xmm6
    punpckhbw   xmm5, xmm6
    pmullw      xmm4, xmm1
    pmullw      xmm5, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm4
    paddw       xmm3, xmm5
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    paddw       xmm3, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4
    psrlw       xmm3, 4

    packuswb    xmm2, xmm3
    movdqa      [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp

    ret

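; The loop above computes, per pixel,
;   dst = (src * src_weight + dst * (16 - src_weight) + 8) >> 4
; where 16 is 1 << MFQE_PRECISION (tMFQE) and 8 is the rounding term
; (tMFQE_round). A worked example with illustrative values: for
; src_weight = 12, dst_weight = 16 - 12 = 4, and pixels src = 200,
; dst = 100, the blend is (200*12 + 100*4 + 8) >> 4 = 2808 >> 4 = 175.
; The 8x8 variant below applies the same blend, but loads only 8 pixels
; per row with movq, so just the low-byte unpack is needed.
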
;void vp8_filter_by_weight8x8_sse2
;(
;    unsigned char *src,
;    int src_stride,
;    unsigned char *dst,
;    int dst_stride,
;    int src_weight
;)
global sym(vp8_filter_by_weight8x8_sse2) PRIVATE
sym(vp8_filter_by_weight8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movd        xmm0, arg(4)                ; src_weight
    pshuflw     xmm0, xmm0, 0x0             ; replicate to all low words
    punpcklqdq  xmm0, xmm0                  ; replicate to all hi words

    movdqa      xmm1, [GLOBAL(tMFQE)]
    psubw       xmm1, xmm0                  ; dst_weight

    mov         rax, arg(0)                 ; src
    mov         rsi, arg(1)                 ; src_stride
    mov         rdx, arg(2)                 ; dst
    mov         rdi, arg(3)                 ; dst_stride

    mov         rcx, 8                      ; loop count
    pxor        xmm4, xmm4

.combine:
    movq        xmm2, [rax]
    movq        xmm3, [rdx]
    add         rax, rsi

    ; src * src_weight
    punpcklbw   xmm2, xmm4
    pmullw      xmm2, xmm0

    ; dst * dst_weight
    punpcklbw   xmm3, xmm4
    pmullw      xmm3, xmm1

    ; sum, round and shift
    paddw       xmm2, xmm3
    paddw       xmm2, [GLOBAL(tMFQE_round)]
    psrlw       xmm2, 4

    packuswb    xmm2, xmm4
    movq        [rdx], xmm2
    add         rdx, rdi

    dec         rcx
    jnz         .combine

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp

    ret

;void vp8_variance_and_sad_16x16_sse2 | arg
;(
;    unsigned char *src1,          0
;    int stride1,                  1
;    unsigned char *src2,          2
;    int stride2,                  3
;    unsigned int *variance,       4
;    unsigned int *sad,            5
;)
global sym(vp8_variance_and_sad_16x16_sse2) PRIVATE
sym(vp8_variance_and_sad_16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax, arg(0)                 ; src1
    mov         rcx, arg(1)                 ; stride1
    mov         rdx, arg(2)                 ; src2
    mov         rdi, arg(3)                 ; stride2

    mov         rsi, 16                     ; block height

    ; Prep accumulator registers
    pxor        xmm3, xmm3                  ; SAD
    pxor        xmm4, xmm4                  ; sum of src2
    pxor        xmm5, xmm5                  ; sum of src2^2

    ; Because we're working with the actual output frames
    ; we can't depend on any kind of data alignment, so use
    ; unaligned loads.
.accumulate:
    movdqu      xmm0, [rax]                 ; src1
    movdqu      xmm1, [rdx]                 ; src2
    add         rax, rcx                    ; src1 + stride1
    add         rdx, rdi                    ; src2 + stride2

    ; SAD(src1, src2)
    psadbw      xmm0, xmm1
    paddusw     xmm3, xmm0

    ; SUM(src2)
    pxor        xmm2, xmm2
    psadbw      xmm2, xmm1                  ; sum src2 by misusing SAD against 0
    paddusw     xmm4, xmm2

    ; pmaddubsw would be ideal if it took two unsigned values. Instead,
    ; it expects one signed and one unsigned value, so we zero-extend
    ; the bytes and operate on words.
    pxor        xmm2, xmm2
    movdqa      xmm0, xmm1
    punpcklbw   xmm0, xmm2
    punpckhbw   xmm1, xmm2
    pmaddwd     xmm0, xmm0
    pmaddwd     xmm1, xmm1
    paddd       xmm5, xmm0
    paddd       xmm5, xmm1

    sub         rsi, 1
    jnz         .accumulate

    ; phaddd only operates on adjacent double words.
    ; Finalize SAD and store
    movdqa      xmm0, xmm3
    psrldq      xmm0, 8
    paddusw     xmm0, xmm3
    paddd       xmm0, [GLOBAL(t128)]
    psrld       xmm0, 8

    mov         rax, arg(5)
    movd        [rax], xmm0

    ; Accumulate sum of src2
    movdqa      xmm0, xmm4
    psrldq      xmm0, 8
    paddusw     xmm0, xmm4

    ; Square src2. Ignore the high dword of the product.
    pmuludq     xmm0, xmm0
    psrld       xmm0, 8

    ; phaddd could be used to sum adjacent values, but we want all of
    ; the values summed. Promote the double words to quad words,
    ; accumulate, then fold the two halves together.
    pxor        xmm2, xmm2
    movdqa      xmm1, xmm5
    punpckldq   xmm1, xmm2
    punpckhdq   xmm5, xmm2
    paddd       xmm1, xmm5
    movdqa      xmm2, xmm1
    psrldq      xmm1, 8
    paddd       xmm1, xmm2

    psubd       xmm1, xmm0

    ; (variance + 128) >> 8
    paddd       xmm1, [GLOBAL(t128)]
    psrld       xmm1, 8
    mov         rax, arg(4)

    movd        [rax], xmm1

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

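; Recap of what the function above computes (a sketch of the register
; math, matching the code rather than adding to it): with N = 256
; pixels in a 16x16 block,
;   *sad      = (SUM(|src1 - src2|) + 128) >> 8
;   *variance = (SUM(src2^2) - SUM(src2)^2 / N + 128) >> 8
; i.e. both outputs are per-pixel averages rounded via the +128 term,
; with the mean-correction SUM(src2)^2 / N produced by the
; pmuludq/psrld pair (sum squared, then divided by 256).
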
SECTION_RODATA
align 16
t128:
%ifndef __NASM_VER__
    ddq 128
%elif CONFIG_BIG_ENDIAN
    dq  0, 128
%else
    dq  128, 0
%endif
align 16
tMFQE: ; 1 << MFQE_PRECISION
    times 8 dw 0x10
align 16
tMFQE_round: ; 1 << (MFQE_PRECISION - 1)
    times 8 dw 0x08
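
; Note on the constants: the psrlw by 4 in the filter loops implies
; MFQE_PRECISION == 4 in the matching C code, so tMFQE = 1 << 4 = 0x10
; and tMFQE_round = 1 << 3 = 0x08. t128 supplies the +128 rounding term
; for the final (x + 128) >> 8 in vp8_variance_and_sad_16x16_sse2.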