;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; FAST_QUANTIZE_4 byte_offset
;
; Quantizes and dequantizes the four 16-bit coefficients located
; byte_offset bytes into each of the input/output arrays:
;
;   x  = |coeff|
;   x  = (zbin > x) ? 0 : x            ; signed 16-bit compare
;   q  = ((x + round) * quant) >> 16   ; unsigned high multiply
;   q  = coeff < 0 ? -q : q
;   qcoeff  = q
;   dqcoeff = q * dequant              ; low 16 bits of the product
;
; In:    rsi = coeff_ptr, rcx = round_ptr, rdx = quant_ptr,
;        rdi = qcoeff_ptr; zbin_ptr / dequant_ptr / dqcoeff_ptr are
;        fetched through arg() on every use.
; Clobb: rax, mm0, mm1, mm3
;
; rax is the only pointer register not already holding a live pointer,
; so it is reloaded for each of the three remaining arrays.
;-----------------------------------------------------------------------
%macro FAST_QUANTIZE_4 1
    movq        mm0,        [rsi + %1]      ; four input coefficients

    mov         rax,        arg(1)          ;zbin_ptr
    movq        mm1,        [rax + %1]

    movq        mm3,        mm0
    psraw       mm0,        15              ; 0xffff where coeff < 0, else 0

    pxor        mm3,        mm0
    psubw       mm3,        mm0             ; abs

    pcmpgtw     mm1,        mm3             ; 0xffff where zbin > abs
    pandn       mm1,        mm3             ; abs where it passed zbin, else 0

    paddw       mm1,        [rcx + %1]      ; + round
    pmulhuw     mm1,        [rdx + %1]      ; * quant, keep high 16 bits

    pxor        mm1,        mm0
    psubw       mm1,        mm0             ; gain the sign back

    movq        [rdi + %1], mm1             ; qcoeff

    mov         rax,        arg(3)          ;dequant_ptr
    pmullw      mm1,        [rax + %1]

    mov         rax,        arg(7)          ;dqcoeff_ptr
    movq        [rax + %1], mm1             ; dqcoeff
%endmacro

;-----------------------------------------------------------------------
; ACCUM_SCAN_MASK_8 byte_offset
;
; For the eight 16-bit qcoeff values starting byte_offset bytes into
; the array, adds the matching scan_mask word of every NON-zero
; coefficient into the two dword lanes of mm5.
;
; In:    rsi = qcoeff_ptr, rdi = scan_mask,
;        mm7 = all zero, mm6 = all ones, mm5 = running sum
; Clobb: mm0, mm1
;-----------------------------------------------------------------------
%macro ACCUM_SCAN_MASK_8 1
    movq        mm0,        [rsi + %1]
    movq        mm1,        [rsi + %1 + 8]

    pcmpeqw     mm0,        mm7             ; 0xffff where qcoeff == 0
    pcmpeqw     mm1,        mm7

    pxor        mm0,        mm6             ; invert: 0xffff where != 0
    pxor        mm1,        mm6

    psrlw       mm0,        15              ; 1 where qcoeff != 0, else 0
    psrlw       mm1,        15

    pmaddwd     mm0,        [rdi + %1]      ; select mask words, sum pairs
    pmaddwd     mm1,        [rdi + %1 + 8]

    paddd       mm5,        mm0
    paddd       mm5,        mm1
%endmacro

;-----------------------------------------------------------------------
;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
;                           short *qcoeff_ptr,short *dequant_ptr,
;                           short *scan_mask, short *round_ptr,
;                           short *quant_ptr, short *dqcoeff_ptr);
;
; Processes 16 coefficients (four groups of four 16-bit values), writing
; qcoeff_ptr[0..15] and dqcoeff_ptr[0..15], and returns the end-of-block
; value in rax (0 when every quantized coefficient is zero).
;
; Arguments are read through arg() after SHADOW_ARGS_TO_STACK, so the
; same text serves every ABI handled by x86_abi_support.asm.
;
; Clobb: rax, rcx, rdx, mm0, mm1, mm3, mm5-mm7, flags
;        (rsi / rdi are saved and restored here).
;
; NOTE(review): pmulhuw is not part of the baseline MMX instruction set
;   (it arrived with SSE / AMD extended MMX) -- callers presumably gate
;   this routine on the matching CPU feature; confirm.
; NOTE(review): no emms is executed before returning; the caller is
;   presumably responsible for clearing the MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_fast_quantize_b_impl_mmx) PRIVATE
sym(vp8_fast_quantize_b_impl_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 8
    push        rsi
    push        rdi
    ; end prolog

    ; Pointers that stay live across all four quantization groups.
    mov         rsi,        arg(0)          ;coeff_ptr
    mov         rdx,        arg(6)          ;quant_ptr
    mov         rcx,        arg(5)          ;round_ptr
    mov         rdi,        arg(2)          ;qcoeff_ptr

    FAST_QUANTIZE_4 0                       ; coefficients  0..3
    FAST_QUANTIZE_4 8                       ; coefficients  4..7
    FAST_QUANTIZE_4 16                      ; coefficients  8..11
    FAST_QUANTIZE_4 24                      ; coefficients 12..15

    ; Build a 16-bit summary of which quantized coefficients are
    ; non-zero by summing their scan_mask words.
    ; Presumably each scan_mask entry has a single bit set at the
    ; coefficient's scan-order position -- verify against the caller.
    mov         rdi,        arg(4)          ;scan_mask
    mov         rsi,        arg(2)          ;qcoeff_ptr

    pxor        mm5,        mm5             ; running sum
    pxor        mm7,        mm7             ; all zero
    pcmpeqw     mm6,        mm6             ; all ones

    ACCUM_SCAN_MASK_8 0
    ACCUM_SCAN_MASK_8 16

    movq        mm0,        mm5
    psrlq       mm5,        32
    paddd       mm0,        mm5             ; low dword = total of both lanes

    ; eob adjustment begins here
    movq        rcx,        mm0
    and         rcx,        0xffff          ; keep the 16 mask bits; pmaddwd
                                            ; sign-extends a 0x8000 mask word

    xor         rdx,        rdx
    sub         rdx,        rcx             ; rdx=-rcx

    bsr         rax,        rcx             ; index of highest set bit
                                            ; (undefined when rcx == 0)
    inc         rax

    sar         rdx,        31              ; all ones if rcx != 0, else 0
    and         rax,        rdx             ; force 0 when nothing was set

    ; The sequence above substitutes for the old mmx mixed assembly/C.
    ; The following is kept as reference:
    ;    movq            rcx,        mm0
    ;    bsr             rax,        rcx
    ;
    ;    mov             eob,        rax
    ;    mov             eee,        rcx
    ;
    ;if(eee==0)
    ;{
    ;    eob=-1;
    ;}
    ;else if(eee<0)
    ;{
    ;    eob=15;
    ;}
    ;d->eob = eob+1;

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret