;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_1: times 8 dw 1                      ; eight 16-bit lanes, each holding 1

SECTION .text

;-----------------------------------------------------------------------------
; QUANTIZE_FN suffix, num_gprs
;
;   %1  function-name suffix; emits quantize_%1. The token b_32x32 selects
;       the extra %ifidn paths below.
;   %2  number of general-purpose registers requested from cglobal.
;
; Syntax: NASM/Yasm, Intel operand order, x86inc register abstraction.
; The body uses m0..m14, so it can only be assembled for x86-64.
;
; Arguments, in cglobal order (all loaded manually from their *mp/*m slots):
;   coeff, ncoeff, skip, zbin, round, quant, shift, qcoeff, dqcoeff,
;   dequant, zbin_oq, eob, scan, iscan          (scan is never read here)
;
; Behaviour:
;   skip != 0 : qcoeff[] and dqcoeff[] are zero-filled and a 16-bit 0 is
;               stored to [eob].
;   skip == 0 : for every 16-bit coefficient c
;                 a   = |c|
;                 on  = a > zbin + zbin_oq - 1
;                 t   = sat16(a + round)
;                 q   = (((t * quant) >> 16) + t) * shift >> 16
;                 qc  = on ? sign(c) * q : 0               -> qcoeff
;                 dqc = qc * dequant (low 16 bits)         -> dqcoeff
;               and the 16-bit max of (iscan[i] + 1) over entries whose
;               dqc is non-zero is stored to [eob] (0 if there are none).
;   b_32x32   : zbin and round are first replaced by (x + 1) >> 1, shift is
;               doubled, dqc becomes sign(c) * ((|qc| * dequant) >> 1), and
;               groups of 16 coefficients that all fail the zbin test are
;               zero-filled without running the multiply chain.
;
; Assumptions (not checked by this code):
;   - every vector access uses mova, so the buffers are assumed to be
;     16-byte aligned;
;   - 16 coefficients are processed per iteration with a do-while style
;     loop, so ncoeff is assumed to be a positive multiple of 16;
;   - NOTE(review): zbin/round/quant/shift/dequant are each read as 8 words;
;     words 0..7 are applied to the first 8 coefficients and words 4..7 to
;     all the rest. Presumably word 0 is the DC value and the others repeat
;     the AC value -- confirm against the caller.
;-----------------------------------------------------------------------------
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
                                eob, scan, iscan
  cmp         dword skipm, 0
  jne .blank

  ; actual quantize loop - setup pointers, rounders, etc.
  movifnidn   coeffq, coeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2, dequantmp
  movifnidn   zbinq, zbinmp
  movifnidn   roundq, roundmp
  movifnidn   quantq, quantmp
  movd        m4, dword zbin_oqm               ; m4 = zbin_oq
  mova        m0, [zbinq]                      ; m0 = zbin
  punpcklwd   m4, m4
  mova        m1, [roundq]                     ; m1 = round
  pshufd      m4, m4, 0                        ; broadcast zbin_oq to all words
  mova        m2, [quantq]                     ; m2 = quant
  paddw       m0, m4                           ; m0 = zbin + zbin_oq
%ifidn %1, b_32x32
  pcmpeqw     m5, m5
  psrlw       m5, 15                           ; m5 = 1 in every word
  paddw       m0, m5
  paddw       m1, m5
  psrlw       m0, 1                            ; m0 = (m0 + 1) / 2
  psrlw       m1, 1                            ; m1 = (m1 + 1) / 2
%endif
  mova        m3, [r2q]                        ; m3 = dequant
  psubw       m0, [pw_1]                       ; so that "abs > m0" tests abs >= zbin
  mov         r2, shiftmp
  mov         r3, qcoeffmp
  mova        m4, [r2]                         ; m4 = shift
  mov         r4, dqcoeffmp
  mov         r5, iscanmp
%ifidn %1, b_32x32
  psllw       m4, 1
%endif
  pxor        m5, m5                           ; m5 = dedicated zero
  ; Rename the registers. d1..d6 are placeholders that keep eob in the same
  ; argument position as in the cglobal line, so eobmp still resolves to it.
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
  ; Point every array at its end and count ncoeff up from -ncoeff to 0.
  lea         coeffq, [  coeffq+ncoeffq*2]
  lea         iscanq, [  iscanq+ncoeffq*2]
  lea         qcoeffq, [ qcoeffq+ncoeffq*2]
  lea         dqcoeffq, [dqcoeffq+ncoeffq*2]
  neg         ncoeffq

  ; get DC and first 15 AC coeffs
  ; Each table register is used once as loaded (first 8 coefficients), then
  ; punpckhqdq copies its high qword into both halves for everything after.
  mova        m9, [  coeffq+ncoeffq*2+ 0]      ; m9 = c[i]
  mova        m10, [  coeffq+ncoeffq*2+16]     ; m10 = c[i]
  pabsw       m6, m9                           ; m6 = abs(m9)
  pabsw       m11, m10                         ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0                       ; m7 = c[i] >= zbin
  punpckhqdq  m0, m0
  pcmpgtw     m12, m11, m0                     ; m12 = c[i] >= zbin
  paddsw      m6, m1                           ; m6 += round
  punpckhqdq  m1, m1
  paddsw      m11, m1                          ; m11 += round
  pmulhw      m8, m6, m2                       ; m8 = m6*q>>16
  punpckhqdq  m2, m2
  pmulhw      m13, m11, m2                     ; m13 = m11*q>>16
  paddw       m8, m6                           ; m8 += m6
  paddw       m13, m11                         ; m13 += m11
  pmulhw      m8, m4                           ; m8 = m8*qsh>>16
  punpckhqdq  m4, m4
  pmulhw      m13, m4                          ; m13 = m13*qsh>>16
  psignw      m8, m9                           ; m8 = reinsert sign
  psignw      m13, m10                         ; m13 = reinsert sign
  pand        m8, m7                           ; zero lanes that failed zbin
  pand        m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m8
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw       m8, m8
  pabsw       m13, m13
%endif
  pmullw      m8, m3                           ; dqc[i] = qc[i] * q
  punpckhqdq  m3, m3
  pmullw      m13, m3                          ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw       m8, 1
  psrlw       m13, 1
  psignw      m8, m9
  psignw      m13, m10
%endif
  mova        [dqcoeffq+ncoeffq*2+ 0], m8
  mova        [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw     m8, m5                           ; m8 = (dqc[i] == 0)
  pcmpeqw     m13, m5                          ; m13 = (dqc[i] == 0)
  mova        m6, [  iscanq+ncoeffq*2+ 0]      ; m6 = scan[i]
  mova        m11, [  iscanq+ncoeffq*2+16]     ; m11 = scan[i]
  psubw       m6, m7                           ; m6 = scan[i] + 1 (mask is -1)
  psubw       m11, m12                         ; m11 = scan[i] + 1
  pandn       m8, m6                           ; m8 = max(eob)
  pandn       m13, m11                         ; m13 = max(eob)
  pmaxsw      m8, m13                          ; m8 = running per-lane eob max
  add         ncoeffq, mmsize
  jz .accumulate_eob

.ac_only_loop:
  mova        m9, [  coeffq+ncoeffq*2+ 0]      ; m9 = c[i]
  mova        m10, [  coeffq+ncoeffq*2+16]     ; m10 = c[i]
  pabsw       m6, m9                           ; m6 = abs(m9)
  pabsw       m11, m10                         ; m11 = abs(m10)
  pcmpgtw     m7, m6, m0                       ; m7 = c[i] >= zbin
  pcmpgtw     m12, m11, m0                     ; m12 = c[i] >= zbin
%ifidn %1, b_32x32
  ; If no lane of either vector passed the zbin test, take the cheap path.
  pmovmskb    r6, m7
  pmovmskb    r2, m12
  or          r6, r2
  jz .skip_iter
%endif
  paddsw      m6, m1                           ; m6 += round
  paddsw      m11, m1                          ; m11 += round
  pmulhw      m14, m6, m2                      ; m14 = m6*q>>16
  pmulhw      m13, m11, m2                     ; m13 = m11*q>>16
  paddw       m14, m6                          ; m14 += m6
  paddw       m13, m11                         ; m13 += m11
  pmulhw      m14, m4                          ; m14 = m14*qsh>>16
  pmulhw      m13, m4                          ; m13 = m13*qsh>>16
  psignw      m14, m9                          ; m14 = reinsert sign
  psignw      m13, m10                         ; m13 = reinsert sign
  pand        m14, m7
  pand        m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m14
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw       m14, m14
  pabsw       m13, m13
%endif
  pmullw      m14, m3                          ; dqc[i] = qc[i] * q
  pmullw      m13, m3                          ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw       m14, 1
  psrlw       m13, 1
  psignw      m14, m9
  psignw      m13, m10
%endif
  mova        [dqcoeffq+ncoeffq*2+ 0], m14
  mova        [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw     m14, m5                          ; m14 = (dqc[i] == 0)
  pcmpeqw     m13, m5                          ; m13 = (dqc[i] == 0)
  mova        m6, [  iscanq+ncoeffq*2+ 0]      ; m6 = scan[i]
  mova        m11, [  iscanq+ncoeffq*2+16]     ; m11 = scan[i]
  psubw       m6, m7                           ; m6 = scan[i] + 1
  psubw       m11, m12                         ; m11 = scan[i] + 1
  pandn       m14, m6                          ; m14 = max(eob)
  pandn       m13, m11                         ; m13 = max(eob)
  pmaxsw      m8, m14
  pmaxsw      m8, m13
  add         ncoeffq, mmsize
  jl .ac_only_loop

%ifidn %1, b_32x32
  jmp .accumulate_eob
.skip_iter:
  ; Nothing in this group survives zbin: write zeros, leave the eob max alone.
  mova        [qcoeffq+ncoeffq*2+ 0], m5
  mova        [qcoeffq+ncoeffq*2+16], m5
  mova        [dqcoeffq+ncoeffq*2+ 0], m5
  mova        [dqcoeffq+ncoeffq*2+16], m5
  add         ncoeffq, mmsize
  jl .ac_only_loop
%endif

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov         r2, eobmp
  pshufd      m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0xe
  pmaxsw      m8, m7
  pshuflw     m7, m8, 0x1
  pmaxsw      m8, m7
  ; pextrw with a memory destination is an SSE4.1 encoding and must not be
  ; used in an ssse3 function. Extract to a GPR (SSE2 form) and store 16 bits.
  ; r3 (qcoeff pointer) is no longer needed once the loop has finished.
  pextrw      r3d, m8, 0
  mov         [r2], r3w
  RET

  ; skip-block, i.e. just write all zeroes
.blank:
  mov         r0, dqcoeffmp
  movifnidn   ncoeffq, ncoeffmp
  mov         r2, qcoeffmp
  mov         r3, eobmp
  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
  lea         dqcoeffq, [dqcoeffq+ncoeffq*2]
  lea         qcoeffq, [ qcoeffq+ncoeffq*2]
  neg         ncoeffq
  pxor        m7, m7
.blank_loop:
  mova        [dqcoeffq+ncoeffq*2+ 0], m7
  mova        [dqcoeffq+ncoeffq*2+16], m7
  mova        [qcoeffq+ncoeffq*2+ 0], m7
  mova        [qcoeffq+ncoeffq*2+16], m7
  add         ncoeffq, mmsize
  jl .blank_loop
  mov         word [eobq], 0
  RET
%endmacro

INIT_XMM ssse3
QUANTIZE_FN b, 6
QUANTIZE_FN b_32x32, 7