;
; Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
; Use of this source code is governed by a BSD-style license
; that can be found in the LICENSE file in the root of the source
; tree. An additional intellectual property rights grant can be found
; in the file PATENTS. All contributing project authors may
; be found in the AUTHORS file in the root of the source tree.
;

; Recovered from a diff hunk that adds
; media/libvpx/vp8/encoder/x86/encodeopt.asm.
;
; Overview
; --------
; Every routine in this file computes a sum of squared differences between
; two arrays of signed 16-bit values and leaves it in rax before returning:
;
;   vp8_block_error_xmm / _mmx          32 bytes per array
;   vp8_mbblock_error_xmm_impl / _mmx   16 groups of 32 bytes, with a mask
;                                       derived from the dc argument
;   vp8_mbuverror_xmm_impl / _mmx       16 groups of 16 bytes
;
; Common arithmetic: psubw forms word-wise differences, pmaddwd squares them
; and adds adjacent pairs into 32-bit lanes, paddd accumulates those lanes.
;
; NOTE(review): psubw wraps at 16 bits, so the result is only a true squared
; error if every difference fits in a signed word -- confirm against callers.
;
; The _xmm routines load with movdqa, which requires 16-byte aligned
; addresses. The _mmx routines never execute emms.
; NOTE(review): presumably the caller clears MMX state before any x87 use --
; confirm.
;
; sym(), arg(), PRIVATE, SHADOW_ARGS_TO_STACK, UNSHADOW_ARGS, SAVE_XMM and
; RESTORE_XMM are not defined in this file; presumably they are supplied by
; the include below. See that file for their exact ABI behaviour.

%include "vpx_ports/x86_abi_support.asm"

;int vp8_block_error_xmm(short *coeff_ptr, short *dcoef_ptr)
;
; Sum of squared differences over the 32 bytes (16 words) at coeff_ptr and
; dcoef_ptr. Both pointers are read with movdqa (16-byte alignment needed).
global sym(vp8_block_error_xmm) PRIVATE
sym(vp8_block_error_xmm):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push rsi
    push rdi
    ; end prologue

    mov rsi, arg(0) ;coeff_ptr
    mov rdi, arg(1) ;dcoef_ptr

    ; words 0..7 of each array
    movdqa xmm0, [rsi]
    movdqa xmm1, [rdi]

    ; words 8..15 of each array
    movdqa xmm2, [rsi+16]
    movdqa xmm3, [rdi+16]

    ; word-wise differences
    psubw xmm0, xmm1
    psubw xmm2, xmm3

    ; square and pair-add: four 32-bit partial sums per register
    pmaddwd xmm0, xmm0
    pmaddwd xmm2, xmm2

    paddd xmm0, xmm2

    ; horizontal add of the four dwords d0..d3 held in xmm0
    pxor xmm5, xmm5
    movdqa xmm1, xmm0

    punpckldq xmm0, xmm5        ; xmm0 = d0, 0, d1, 0
    punpckhdq xmm1, xmm5        ; xmm1 = d2, 0, d3, 0

    paddd xmm0, xmm1            ; xmm0 = d0+d2, 0, d1+d3, 0
    movdqa xmm1, xmm0

    psrldq xmm0, 8              ; bring d1+d3 down to the low dword
    paddd xmm0, xmm1            ; low dword = d0+d1+d2+d3, next dword = 0

    movq rax, xmm0              ; result

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret

;int vp8_block_error_mmx(short *coeff_ptr, short *dcoef_ptr)
;
; MMX counterpart of vp8_block_error_xmm: sum of squared differences over the
; 32 bytes at coeff_ptr and dcoef_ptr, processed 8 bytes at a time.
; Loads and arithmetic are interleaved; the instruction order is kept as is.
global sym(vp8_block_error_mmx) PRIVATE
sym(vp8_block_error_mmx):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push rsi
    push rdi
    ; end prolog


    mov rsi, arg(0) ;coeff_ptr
    pxor mm7, mm7               ; mm7 = 0

    mov rdi, arg(1) ;dcoef_ptr
    movq mm3, [rsi]             ; coeff words 0..3

    movq mm4, [rdi]             ; dcoef words 0..3
    movq mm5, [rsi+8]           ; coeff words 4..7

    movq mm6, [rdi+8]           ; dcoef words 4..7
    pxor mm1, mm1 ; from movd mm1, dc ; dc =0

    ; mm1 and mm2 are both zero here, so the pcmpeqw against mm7 (zero) below
    ; yields an all-ones mask and the pand passes the differences through
    ; unchanged. The sequence mirrors the dc masking in
    ; vp8_mbblock_error_mmx_impl with dc fixed at 0.
    movq mm2, mm7
    psubw mm5, mm6              ; diff words 4..7

    por mm1, mm2
    pmaddwd mm5, mm5            ; squared diffs 4..7, pair-added

    pcmpeqw mm1, mm7            ; mm1 = all ones
    psubw mm3, mm4              ; diff words 0..3

    pand mm1, mm3               ; mm1 = diff words 0..3 (mask is all ones)
    pmaddwd mm1, mm1            ; squared diffs 0..3, pair-added

    paddd mm1, mm5              ; running sum in mm1
    movq mm3, [rsi+16]          ; coeff words 8..11

    movq mm4, [rdi+16]          ; dcoef words 8..11
    movq mm5, [rsi+24]          ; coeff words 12..15

    movq mm6, [rdi+24]          ; dcoef words 12..15
    psubw mm5, mm6

    pmaddwd mm5, mm5
    psubw mm3, mm4

    pmaddwd mm3, mm3
    paddd mm3, mm5

    paddd mm1, mm3              ; mm1 = two 32-bit partial sums (lo, hi)
    movq mm0, mm1

    psrlq mm1, 32               ; move hi partial sum to the low dword
    paddd mm0, mm1              ; low dword of mm0 = lo + hi

    ; Only the low dword of mm0 is the total; its upper dword still holds the
    ; hi partial sum. The C prototype above declares an int return.
    movq rax, mm0

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret


;int vp8_mbblock_error_mmx_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; Sum of squared differences over 16 consecutive 32-byte groups (512 bytes per
; array). A word mask derived from dc is ANDed onto the first four differences
; of every group: each mask word is 0xFFFF where the corresponding 16-bit word
; of the zero-extended dc is zero, and 0 otherwise. With the low word of dc
; non-zero, the first difference of every group is therefore excluded.
; NOTE(review): non-zero bits 16..31 of dc would also drop the second word of
; each group; presumably callers only pass 0 or 1 -- confirm.
global sym(vp8_mbblock_error_mmx_impl) PRIVATE
sym(vp8_mbblock_error_mmx_impl):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    push rsi
    push rdi
    ; end prolog


    mov rsi, arg(0) ;coeff_ptr
    pxor mm7, mm7               ; mm7 = 0, comparison reference

    mov rdi, arg(1) ;dcoef_ptr
    pxor mm2, mm2               ; mm2 = accumulator, starts at 0

    movd mm1, dword ptr arg(2) ;dc
    por mm1, mm2                ; mm2 is zero, so mm1 is unchanged

    pcmpeqw mm1, mm7            ; mm1 = per-word mask (see header comment)
    mov rcx, 16                 ; number of 32-byte groups

.mberror_loop_mmx:
    movq mm3, [rsi]             ; coeff words 0..3 of this group
    movq mm4, [rdi]             ; dcoef words 0..3

    movq mm5, [rsi+8]           ; coeff words 4..7
    movq mm6, [rdi+8]           ; dcoef words 4..7


    psubw mm5, mm6
    pmaddwd mm5, mm5

    psubw mm3, mm4
    pand mm3, mm1               ; apply dc mask to differences 0..3 only

    pmaddwd mm3, mm3
    paddd mm2, mm5

    paddd mm2, mm3
    movq mm3, [rsi+16]          ; coeff words 8..11

    movq mm4, [rdi+16]          ; dcoef words 8..11
    movq mm5, [rsi+24]          ; coeff words 12..15

    movq mm6, [rdi+24]          ; dcoef words 12..15
    psubw mm5, mm6

    pmaddwd mm5, mm5
    psubw mm3, mm4

    pmaddwd mm3, mm3
    paddd mm2, mm5

    paddd mm2, mm3
    add rsi, 32                 ; next group

    add rdi, 32
    sub rcx, 1

    jnz .mberror_loop_mmx

    ; fold the two 32-bit partial sums; the total ends up in the low dword
    movq mm0, mm2
    psrlq mm2, 32

    paddd mm0, mm2
    movq rax, mm0               ; upper dword still holds the hi partial sum

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret


;int vp8_mbblock_error_xmm_impl(short *coeff_ptr, short *dcoef_ptr, int dc);
;
; SSE2 counterpart of vp8_mbblock_error_mmx_impl: 16 groups of 32 bytes, with
; the dc-derived word mask ANDed onto the first eight differences of every
; group (mask words 2..7 are always 0xFFFF because movd zero-extends dc).
; Pointers are read with movdqa (16-byte alignment needed).
; SAVE_XMM 6 / RESTORE_XMM bracket the use of xmm6.
; NOTE(review): presumably these macros preserve xmm6 where the target ABI
; treats it as callee-saved -- confirm in the include.
global sym(vp8_mbblock_error_xmm_impl) PRIVATE
sym(vp8_mbblock_error_xmm_impl):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 3
    SAVE_XMM 6
    push rsi
    push rdi
    ; end prolog


    mov rsi, arg(0) ;coeff_ptr
    pxor xmm6, xmm6             ; xmm6 = 0, used for compare and unpack

    mov rdi, arg(1) ;dcoef_ptr
    pxor xmm4, xmm4             ; xmm4 = accumulator, starts at 0

    movd xmm5, dword ptr arg(2) ;dc
    por xmm5, xmm4              ; xmm4 is zero, so xmm5 is unchanged

    pcmpeqw xmm5, xmm6          ; xmm5 = per-word mask
    mov rcx, 16                 ; number of 32-byte groups

.mberror_loop:
    movdqa xmm0, [rsi]          ; coeff words 0..7 of this group
    movdqa xmm1, [rdi]          ; dcoef words 0..7

    movdqa xmm2, [rsi+16]       ; coeff words 8..15
    movdqa xmm3, [rdi+16]       ; dcoef words 8..15


    psubw xmm2, xmm3
    pmaddwd xmm2, xmm2

    psubw xmm0, xmm1
    pand xmm0, xmm5             ; apply dc mask to differences 0..7 only

    pmaddwd xmm0, xmm0
    add rsi, 32                 ; next group

    add rdi, 32

    ; The flags set by this sub are consumed by the jnz below; the paddd
    ; instructions in between do not modify flags.
    sub rcx, 1
    paddd xmm4, xmm2

    paddd xmm4, xmm0
    jnz .mberror_loop

    ; horizontal add of the four dwords d0..d3 held in xmm4
    movdqa xmm0, xmm4
    punpckldq xmm0, xmm6        ; xmm0 = d0, 0, d1, 0

    punpckhdq xmm4, xmm6        ; xmm4 = d2, 0, d3, 0
    paddd xmm0, xmm4            ; xmm0 = d0+d2, 0, d1+d3, 0

    movdqa xmm1, xmm0
    psrldq xmm0, 8              ; bring d1+d3 down to the low dword

    paddd xmm0, xmm1            ; low dword = total, next dword = 0
    movq rax, xmm0              ; result

    pop rdi
    pop rsi
    ; begin epilog
    RESTORE_XMM
    UNSHADOW_ARGS
    pop rbp
    ret


;int vp8_mbuverror_mmx_impl(short *s_ptr, short *d_ptr);
;
; Sum of squared differences over 16 consecutive 16-byte groups (256 bytes,
; i.e. 128 words, per array). No masking is applied.
global sym(vp8_mbuverror_mmx_impl) PRIVATE
sym(vp8_mbuverror_mmx_impl):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push rsi
    push rdi
    ; end prolog


    mov rsi, arg(0) ;s_ptr
    mov rdi, arg(1) ;d_ptr

    mov rcx, 16                 ; number of 16-byte groups
    pxor mm7, mm7               ; mm7 = accumulator, starts at 0

.mbuverror_loop_mmx:

    ; words 0..3 of this group
    movq mm1, [rsi]
    movq mm2, [rdi]

    psubw mm1, mm2
    pmaddwd mm1, mm1


    ; words 4..7 of this group
    movq mm3, [rsi+8]
    movq mm4, [rdi+8]

    psubw mm3, mm4
    pmaddwd mm3, mm3


    paddd mm7, mm1
    paddd mm7, mm3


    add rsi, 16                 ; next group
    add rdi, 16

    dec rcx
    jnz .mbuverror_loop_mmx

    ; fold the two 32-bit partial sums; the total ends up in the low dword
    movq mm0, mm7
    psrlq mm7, 32

    paddd mm0, mm7
    movq rax, mm0               ; upper dword still holds the hi partial sum

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret


;int vp8_mbuverror_xmm_impl(short *s_ptr, short *d_ptr);
;
; SSE2 counterpart of vp8_mbuverror_mmx_impl: 16 iterations of one 16-byte
; movdqa load per array (256 bytes per array; 16-byte alignment needed).
global sym(vp8_mbuverror_xmm_impl) PRIVATE
sym(vp8_mbuverror_xmm_impl):
    push rbp
    mov rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    push rsi
    push rdi
    ; end prolog


    mov rsi, arg(0) ;s_ptr
    mov rdi, arg(1) ;d_ptr

    mov rcx, 16                 ; number of 16-byte groups
    pxor xmm3, xmm3             ; xmm3 = accumulator, starts at 0

.mbuverror_loop:

    movdqa xmm1, [rsi]
    movdqa xmm2, [rdi]

    psubw xmm1, xmm2
    pmaddwd xmm1, xmm1

    paddd xmm3, xmm1

    add rsi, 16                 ; next group
    add rdi, 16

    dec rcx
    jnz .mbuverror_loop

    ; horizontal add of the four dwords d0..d3 held in xmm3
    pxor xmm0, xmm0
    movdqa xmm1, xmm3

    movdqa xmm2, xmm1
    punpckldq xmm1, xmm0        ; xmm1 = d0, 0, d1, 0

    punpckhdq xmm2, xmm0        ; xmm2 = d2, 0, d3, 0
    paddd xmm1, xmm2            ; xmm1 = d0+d2, 0, d1+d3, 0

    movdqa xmm2, xmm1

    psrldq xmm1, 8              ; bring d1+d3 down to the low dword
    paddd xmm1, xmm2            ; low dword = total, next dword = 0

    movq rax, xmm1              ; result

    pop rdi
    pop rsi
    ; begin epilog
    UNSHADOW_ARGS
    pop rbp
    ret