;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; QUANTIZE_FOUR byte_offset
;
; Quantizes the four 16-bit coefficients found byte_offset bytes into each
; table and stores both the quantized and the dequantized result.
; Per 16-bit lane:
;     a       = abs(coeff)
;     a       = (zbin > a) ? 0 : a                 (signed compare)
;     q       = ((a + round) * quant) >> 16        (unsigned high multiply)
;     qcoeff  = q with the sign of coeff restored
;     dqcoeff = low 16 bits of (qcoeff * dequant)
;
; Expects: rsi = coeff_ptr, rdi = qcoeff_ptr, rcx = round_ptr, rdx = quant_ptr.
; Clobbers: rax, mm0, mm1, mm3.
; rax serves as the scratch pointer for three different tables (zbin,
; dequant, dqcoeff), which is why it is reloaded from the argument slots.
%macro QUANTIZE_FOUR 1
    movq        mm0,        [rsi + %1]      ; four coefficients

    mov         rax,        arg(1)          ;zbin_ptr
    movq        mm1,        [rax + %1]

    movq        mm3,        mm0
    psraw       mm0,        15              ; mm0 = sign mask (0 or -1 per word)

    pxor        mm3,        mm0
    psubw       mm3,        mm0             ; abs

    pcmpgtw     mm1,        mm3             ; -1 in lanes where zbin > abs
    pandn       mm1,        mm3             ; abs where kept, 0 elsewhere

    paddw       mm1,        [rcx + %1]      ; + round
    pmulhuw     mm1,        [rdx + %1]      ; * quant, keep the high word

    pxor        mm1,        mm0
    psubw       mm1,        mm0             ;gain the sign back

    movq        [rdi + %1], mm1             ; store qcoeff

    mov         rax,        arg(3)          ;dequant_ptr
    pmullw      mm1,        [rax + %1]

    mov         rax,        arg(7)          ;dqcoeff_ptr
    movq        [rax + %1], mm1             ; store dqcoeff
%endmacro

; SCAN_BITS_EIGHT byte_offset
;
; For the eight quantized coefficients starting byte_offset bytes into
; qcoeff, turns every non-zero word into 1 and every zero word into 0,
; multiplies by the matching scan_mask words and adds the products into the
; two dword lanes of mm5.
;
; Expects: rsi = qcoeff_ptr, rdi = scan_mask, mm7 = 0, mm6 = all ones,
;          mm5 = running sum.
; Clobbers: mm0, mm1.
%macro SCAN_BITS_EIGHT 1
    movq        mm0,        [rsi + %1]
    movq        mm1,        [rsi + %1 + 8]

    pcmpeqw     mm0,        mm7             ; -1 where qcoeff == 0
    pcmpeqw     mm1,        mm7

    pxor        mm0,        mm6             ; invert: -1 where qcoeff != 0
    pxor        mm1,        mm6

    psrlw       mm0,        15              ; reduce -1 to 1
    psrlw       mm1,        15

    pmaddwd     mm0,        [rdi + %1]
    pmaddwd     mm1,        [rdi + %1 + 8]

    paddd       mm5,        mm0
    paddd       mm5,        mm1
%endmacro

;int vp8_fast_quantize_b_impl_mmx(short *coeff_ptr, short *zbin_ptr,
;                           short *qcoeff_ptr,short *dequant_ptr,
;                           short *scan_mask, short *round_ptr,
;                           short *quant_ptr, short *dqcoeff_ptr);
;
; Quantizes 16 coefficients (four groups of four words), writing qcoeff_ptr
; and dqcoeff_ptr, then derives the return value from the scan_mask-weighted
; sum of the non-zero quantized coefficients.
;
; Returns (in rax): 0 when the low 16 bits of that sum are zero, otherwise
; the index of their highest set bit plus one.
; NOTE(review): presumably each scan_mask word is the single bit for that
; coefficient's position in scan order, making the result the end-of-block
; position -- confirm against the caller.
; NOTE(review): no emms is issued before returning; presumably the caller is
; responsible for clearing the MMX state -- confirm.
global sym(vp8_fast_quantize_b_impl_mmx) PRIVATE
sym(vp8_fast_quantize_b_impl_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 8
    push        rsi
    push        rdi
    ; end prolog

    ; Pointers that stay live across all four groups.
    mov         rsi,        arg(0)          ;coeff_ptr
    mov         rdi,        arg(2)          ;qcoeff_ptr
    mov         rcx,        arg(5)          ;round_ptr
    mov         rdx,        arg(6)          ;quant_ptr

    QUANTIZE_FOUR 0
    QUANTIZE_FOUR 8
    QUANTIZE_FOUR 16
    QUANTIZE_FOUR 24

    ; Re-read the quantized values just stored and sum the scan_mask
    ; entries of the non-zero ones.
    mov         rdi,        arg(4)          ;scan_mask
    mov         rsi,        arg(2)          ;qcoeff_ptr

    pxor        mm5,        mm5             ; running sum
    pxor        mm7,        mm7             ; zero, for the == 0 test
    pcmpeqw     mm6,        mm6             ; all ones, for the inversion

    SCAN_BITS_EIGHT 0
    SCAN_BITS_EIGHT 16

    ; Fold the two dword lanes together.
    movq        mm0,        mm5
    psrlq       mm5,        32
    paddd       mm0,        mm5

    ; eob adjustment begins here
    ; Branch-free form of:  rax = mask ? bsr(mask) + 1 : 0
    ; bsr leaves its destination undefined for a zero source, so the result
    ; is ANDed with rdx, which is 0 when the mask is zero and all ones
    ; otherwise.
    movq        rcx,        mm0
    and         rcx,        0xffff          ; only the low 16 bits are used

    xor         rdx,        rdx
    sub         rdx,        rcx             ; rdx=-rcx

    bsr         rax,        rcx
    inc         rax

    sar         rdx,        31              ; 0 if mask == 0, else -1
    and         rax,        rdx

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret