;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license and patent
;  grant that can be found in the LICENSE file in the root of the source
;  tree. All contributing project authors may be found in the AUTHORS
;  file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"
%include "vp8_asm_enc_offsets.asm"


;void vp8_fast_quantize_b_ssse3 | arg
;    (BLOCK  *b,                |  0
;     BLOCKD *d)                |  1
;

global sym(vp8_fast_quantize_b_ssse3) PRIVATE
sym(vp8_fast_quantize_b_ssse3):
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx

%if ABI_IS_32BIT
    push        rdi
    push        rsi
%else
  %if LIBVPX_YASM_WIN64
    push        rdi
    push        rsi
  %endif
%endif
    ; end prolog

%if ABI_IS_32BIT
    mov         rdi, arg(0)                 ; BLOCK *b
    mov         rsi, arg(1)                 ; BLOCKD *d
%else
  %if LIBVPX_YASM_WIN64
    mov         rdi, rcx                    ; BLOCK *b
    mov         rsi, rdx                    ; BLOCKD *d
  %else
    ;mov        rdi, rdi                    ; BLOCK *b
    ;mov        rsi, rsi                    ; BLOCKD *d
  %endif
%endif

    mov         rax, [rdi + vp8_block_coeff]
    mov         rcx, [rdi + vp8_block_round]
    mov         rdx, [rdi + vp8_block_quant_fast]

    ; z = coeff
    movdqa      xmm0, [rax]
    movdqa      xmm4, [rax + 16]

    ; round
    movdqa      xmm2, [rcx]
    movdqa      xmm3, [rcx + 16]

    movdqa      xmm1, xmm0
    movdqa      xmm5, xmm4

    ; sz = z >> 15
    psraw       xmm0, 15
    psraw       xmm4, 15

    ; x = abs(z)
    pabsw       xmm1, xmm1
    pabsw       xmm5, xmm5

    ; x + round
    paddw       xmm1, xmm2
    paddw       xmm5, xmm3

    ; y = ((x + round) * quant_fast) >> 16
    pmulhw      xmm1, [rdx]
    pmulhw      xmm5, [rdx + 16]

    mov         rax, [rsi + vp8_blockd_qcoeff]
    mov         rdi, [rsi + vp8_blockd_dequant]
    mov         rcx, [rsi + vp8_blockd_dqcoeff]

    movdqa      xmm2, xmm1                  ; store y for getting eob
    movdqa      xmm3, xmm5

    ; x = (y ^ sz) - sz, restore the sign
    pxor        xmm1, xmm0
    pxor        xmm5, xmm4
    psubw       xmm1, xmm0
    psubw       xmm5, xmm4

    ; store qcoeff
    movdqa      [rax], xmm1
    movdqa      [rax + 16], xmm5

    movdqa      xmm0, [rdi]
    movdqa      xmm4, [rdi + 16]

    ; dqcoeff = qcoeff * dequant
    pmullw      xmm0, xmm1
    pmullw      xmm4, xmm5
    pxor        xmm1, xmm1

    pcmpgtw     xmm2, xmm1                  ; calculate eob: mask of y != 0
    pcmpgtw     xmm3, xmm1
    packsswb    xmm2, xmm3
    pshufb      xmm2, [GLOBAL(zz_shuf)]     ; reorder mask into zigzag scan order

    pmovmskb    edx, xmm2                   ; bit i set = scan position i nonzero

    movdqa      [rcx], xmm0                 ; store dqcoeff
    movdqa      [rcx + 16], xmm4            ; store dqcoeff
    mov         rcx, [rsi + vp8_blockd_eob]

    bsr         eax, edx                    ; index of last nonzero coeff
    add         eax, 1                      ; eob = index + 1

    cmp         edx, 0                      ; if all 0, eob = 0
    cmove       eax, edx                    ; (bsr is undefined for a zero source)

    mov         BYTE PTR [rcx], al          ; store eob

    ; begin epilog
%if ABI_IS_32BIT
    pop         rsi
    pop         rdi
%else
  %if LIBVPX_YASM_WIN64
    pop         rsi
    pop         rdi
  %endif
%endif

    RESTORE_GOT
    pop         rbp
    ret

SECTION_RODATA
align 16
zz_shuf:
    db 0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
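
; --------------------------------------------------------------------------
; Reference sketch (comment only, not part of the build): a minimal scalar
; C version of what the routine above computes, assuming flat int16_t
; arrays in place of the BLOCK/BLOCKD fields that the assembly reaches
; through the vp8_block_* / vp8_blockd_* offsets. The helper name and
; signature are illustrative, not a libvpx API.
;
;   #include <stdint.h>
;
;   /* zigzag scan order; same table as zz_shuf above */
;   static const int zigzag[16] = {
;       0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
;   };
;
;   static void fast_quantize_b_sketch(const int16_t coeff[16],
;                                      const int16_t round[16],
;                                      const int16_t quant_fast[16],
;                                      const int16_t dequant[16],
;                                      int16_t qcoeff[16],
;                                      int16_t dqcoeff[16],
;                                      char *eob)
;   {
;       int last = -1;                      /* scan index of last nonzero   */
;       for (int i = 0; i < 16; i++) {
;           int rc = zigzag[i];
;           int z  = coeff[rc];
;           int sz = z >> 31;               /* sign mask   (psraw xmm, 15)  */
;           int x  = (z ^ sz) - sz;         /* abs(z)      (pabsw)          */
;           int y  = ((x + round[rc]) * quant_fast[rc]) >> 16;
;                                           /* pmulhw keeps the high 16 bits */
;           qcoeff[rc]  = (int16_t)((y ^ sz) - sz);   /* pxor + psubw:
;                                                        restore the sign   */
;           dqcoeff[rc] = (int16_t)(qcoeff[rc] * dequant[rc]); /* pmullw    */
;           if (y)
;               last = i;                   /* bsr over the zigzag mask     */
;       }
;       *eob = (char)(last + 1);            /* cmove forces 0 when all zero */
;   }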