;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Origin (from the diff header this file was extracted from):
;   media/libvpx/vp9/encoder/x86/vp9_quantize_ssse3.asm

%include "third_party/x86inc/x86inc.asm"

SECTION_RODATA
pw_1: times 8 dw 1

SECTION .text

;-----------------------------------------------------------------------------
; QUANTIZE_FN name_suffix, num_gprs
;
;   %1  suffix of the emitted function (quantize_%1). When %1 is b_32x32 the
;       extra halving/doubling steps guarded by %ifidn are assembled in.
;   %2  number of general-purpose registers requested from cglobal.
;
; Arguments of the emitted function, in order (none are auto-loaded; each is
; fetched explicitly via its <name>mp / <name>m accessor):
;   coeff, ncoeff, skip, zbin, round, quant, shift, qcoeff, dqcoeff, dequant,
;   zbin_oq, eob, scan, iscan
;
; Behaviour visible in this code:
;   * skip != 0: qcoeff[] and dqcoeff[] are zero-filled and *eob is set to 0.
;   * otherwise, every 16-bit coefficient is quantized to qcoeff[], dequantized
;     to dqcoeff[], and the maximum of (iscan[i] + 1) over the coefficients
;     that pass the zbin test and produce a non-zero dequantized value is
;     stored, as a 16-bit word, to *eob.
;   * the first 8 coefficients use all eight words of zbin/round/quant/shift/
;     dequant; afterwards the upper quadword of each is broadcast and used for
;     every remaining coefficient.
;   * `scan` is accepted but never referenced.
;
; Assumptions implied by the code (NOTE(review): confirm against callers):
;   * ncoeff is a positive multiple of 16: the first group of 16 is processed
;     unconditionally and the loops advance by 16 coefficients per iteration.
;   * all vector loads/stores use mova, so coeff, qcoeff, dqcoeff, iscan, zbin,
;     round, quant, shift and dequant are expected to be 16-byte aligned.
;   * x86-64 only: xmm8-xmm14 are used.
;-----------------------------------------------------------------------------
%macro QUANTIZE_FN 2
cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, zbin, round, quant, \
                                shift, qcoeff, dqcoeff, dequant, zbin_oq, \
                                eob, scan, iscan
  cmp                    dword skipm, 0
  jne .blank

  ; actual quantize loop - setup pointers, rounders, etc.
  movifnidn                   coeffq, coeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, dequantmp
  movifnidn                    zbinq, zbinmp
  movifnidn                   roundq, roundmp
  movifnidn                   quantq, quantmp
  movd                            m4, dword zbin_oqm       ; m4 = zbin_oq
  mova                            m0, [zbinq]              ; m0 = zbin
  punpcklwd                       m4, m4
  mova                            m1, [roundq]             ; m1 = round
  pshufd                          m4, m4, 0                ; broadcast zbin_oq to all words
  mova                            m2, [quantq]             ; m2 = quant
  paddw                           m0, m4                   ; m0 = zbin + zbin_oq
%ifidn %1, b_32x32
  pcmpeqw                         m5, m5
  psrlw                           m5, 15                   ; m5 = 1 in every word
  paddw                           m0, m5
  paddw                           m1, m5
  psrlw                           m0, 1                    ; m0 = (m0 + 1) / 2
  psrlw                           m1, 1                    ; m1 = (m1 + 1) / 2
%endif
  mova                            m3, [r2q]                ; m3 = dequant
  psubw                           m0, [pw_1]               ; zbin - 1, so that '>' below acts as '>='
  mov                             r2, shiftmp
  mov                             r3, qcoeffmp
  mova                            m4, [r2]                 ; m4 = shift
  mov                             r4, dqcoeffmp
  mov                             r5, iscanmp
%ifidn %1, b_32x32
  psllw                           m4, 1
%endif
  pxor                            m5, m5                   ; m5 = dedicated zero
  ; Rebind register names to their new contents. d1..d6 are placeholders that
  ; keep `eob` at its original argument position so that eobmp still resolves.
  DEFINE_ARGS coeff, ncoeff, d1, qcoeff, dqcoeff, iscan, d2, d3, d4, d5, d6, eob
  ; Point every array at its end and count ncoeff up from -n towards zero.
  lea                         coeffq, [  coeffq+ncoeffq*2]
  lea                         iscanq, [  iscanq+ncoeffq*2]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
  neg                        ncoeffq

  ; get DC and first 15 AC coeffs
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
  punpckhqdq                      m0, m0                   ; from here on use upper-half zbin
  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
  paddsw                          m6, m1                   ; m6 += round
  punpckhqdq                      m1, m1
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                          m8, m6, m2               ; m8 = m6*q>>16
  punpckhqdq                      m2, m2
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  paddw                           m8, m6                   ; m8 += m6
  paddw                          m13, m11                  ; m13 += m11
  pmulhw                          m8, m4                   ; m8 = m8*qsh>>16
  punpckhqdq                      m4, m4
  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
  psignw                          m8, m9                   ; m8 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  pand                            m8, m7                   ; zero lanes that failed the zbin test
  pand                           m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m8
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw                           m8, m8
  pabsw                          m13, m13
%endif
  pmullw                          m8, m3                   ; dqc[i] = qc[i] * q
  punpckhqdq                      m3, m3
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw                           m8, 1                    ; halve the magnitude ...
  psrlw                          m13, 1
  psignw                          m8, m9                   ; ... then restore the sign
  psignw                         m13, m10
%endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m8
  mova       [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw                         m8, m5                   ; m8 = dqc[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]
  psubw                           m6, m7                   ; m6 = iscan[i] + 1 (where mask set)
  psubw                          m11, m12                  ; m11 = iscan[i] + 1 (where mask set)
  pandn                           m8, m6                   ; m8 = max(eob)
  pandn                          m13, m11                  ; m13 = max(eob)
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jz .accumulate_eob

.ac_only_loop:
  mova                            m9, [  coeffq+ncoeffq*2+ 0] ; m9 = c[i]
  mova                           m10, [  coeffq+ncoeffq*2+16] ; m10 = c[i]
  pabsw                           m6, m9                   ; m6 = abs(m9)
  pabsw                          m11, m10                  ; m11 = abs(m10)
  pcmpgtw                         m7, m6, m0               ; m7 = c[i] >= zbin
  pcmpgtw                        m12, m11, m0              ; m12 = c[i] >= zbin
%ifidn %1, b_32x32
  ; If none of the 16 coefficients passes the zbin test, just store zeroes.
  pmovmskb                        r6, m7
  pmovmskb                        r2, m12
  or                              r6, r2
  jz .skip_iter
%endif
  paddsw                          m6, m1                   ; m6 += round
  paddsw                         m11, m1                   ; m11 += round
  pmulhw                         m14, m6, m2               ; m14 = m6*q>>16
  pmulhw                         m13, m11, m2              ; m13 = m11*q>>16
  paddw                          m14, m6                   ; m14 += m6
  paddw                          m13, m11                  ; m13 += m11
  pmulhw                         m14, m4                   ; m14 = m14*qsh>>16
  pmulhw                         m13, m4                   ; m13 = m13*qsh>>16
  psignw                         m14, m9                   ; m14 = reinsert sign
  psignw                         m13, m10                  ; m13 = reinsert sign
  pand                           m14, m7
  pand                           m13, m12
  mova        [qcoeffq+ncoeffq*2+ 0], m14
  mova        [qcoeffq+ncoeffq*2+16], m13
%ifidn %1, b_32x32
  pabsw                          m14, m14
  pabsw                          m13, m13
%endif
  pmullw                         m14, m3                   ; dqc[i] = qc[i] * q
  pmullw                         m13, m3                   ; dqc[i] = qc[i] * q
%ifidn %1, b_32x32
  psrlw                          m14, 1
  psrlw                          m13, 1
  psignw                         m14, m9
  psignw                         m13, m10
%endif
  mova       [dqcoeffq+ncoeffq*2+ 0], m14
  mova       [dqcoeffq+ncoeffq*2+16], m13
  pcmpeqw                        m14, m5                   ; m14 = dqc[i] == 0
  pcmpeqw                        m13, m5                   ; m13 = dqc[i] == 0
  mova                            m6, [  iscanq+ncoeffq*2+ 0] ; m6 = iscan[i]
  mova                           m11, [  iscanq+ncoeffq*2+16] ; m11 = iscan[i]
  psubw                           m6, m7                   ; m6 = iscan[i] + 1 (where mask set)
  psubw                          m11, m12                  ; m11 = iscan[i] + 1 (where mask set)
  pandn                          m14, m6                   ; m14 = max(eob)
  pandn                          m13, m11                  ; m13 = max(eob)
  pmaxsw                          m8, m14
  pmaxsw                          m8, m13
  add                        ncoeffq, mmsize
  jl .ac_only_loop

%ifidn %1, b_32x32
  jmp .accumulate_eob
.skip_iter:
  mova        [qcoeffq+ncoeffq*2+ 0], m5
  mova        [qcoeffq+ncoeffq*2+16], m5
  mova       [dqcoeffq+ncoeffq*2+ 0], m5
  mova       [dqcoeffq+ncoeffq*2+16], m5
  add                        ncoeffq, mmsize
  jl .ac_only_loop
%endif

.accumulate_eob:
  ; horizontally accumulate/max eobs and write into [eob] memory pointer
  mov                             r2, eobmp
  pshufd                          m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0xe
  pmaxsw                          m8, m7
  pshuflw                         m7, m8, 0x1
  pmaxsw                          m8, m7
  ; The memory-destination form of pextrw is an SSE4.1 encoding and would
  ; fault on SSSE3-only CPUs. Extract to a GPR (SSE2 form) and store the low
  ; word instead. r0 (coeff) is no longer needed at this point.
  pextrw                         r0d, m8, 0
  mov                           [r2], r0w
  RET

  ; skip-block, i.e. just write all zeroes
.blank:
  mov                             r0, dqcoeffmp
  movifnidn                  ncoeffq, ncoeffmp
  mov                             r2, qcoeffmp
  mov                             r3, eobmp
  DEFINE_ARGS dqcoeff, ncoeff, qcoeff, eob
  lea                       dqcoeffq, [dqcoeffq+ncoeffq*2]
  lea                        qcoeffq, [ qcoeffq+ncoeffq*2]
  neg                        ncoeffq
  pxor                            m7, m7
.blank_loop:
  mova       [dqcoeffq+ncoeffq*2+ 0], m7
  mova       [dqcoeffq+ncoeffq*2+16], m7
  mova        [qcoeffq+ncoeffq*2+ 0], m7
  mova        [qcoeffq+ncoeffq*2+16], m7
  add                        ncoeffq, mmsize
  jl .blank_loop
  mov                    word [eobq], 0
  RET
%endmacro

INIT_XMM ssse3
QUANTIZE_FN b, 6
QUANTIZE_FN b_32x32, 7