;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; SSE2 variance / sub-pixel variance kernels for VP8.
;
; Syntax: NASM/YASM (Intel operand order).
; ABI:    abstracted by the macros from x86_abi_support.asm (arg(), sym(),
;         SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, GLOBAL, ...); the same
;         source is assembled for 32-bit and 64-bit targets, selected with
;         ABI_IS_32BIT.
;
; Conventions shared by the variance kernels below:
;   xmm0 = all zeros, used to widen bytes to words with punpck*bw
;   xmm6 = per-lane 16-bit running sum of (filtered ref - src) differences
;   xmm7 = per-lane 32-bit running sum of squared differences
; (vp8_get16x16var_sse2 / vp8_get8x8var_sse2 use xmm7 for the sum and
;  xmm6 / xmm1 for the squares instead.)
;
; None of the routines in this file touch MMX registers, so no EMMS is
; required after calling them.

%include "vpx_ports/x86_abi_support.asm"

%define xmm_filter_shift            7

;-----------------------------------------------------------------------
;unsigned int vp8_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;
; Returns the sum of squares of 256 consecutive 16-bit values
; (8 iterations x 64 bytes).  src_ptr must be 16-byte aligned because the
; data is read with movdqa.  Only the low 32 bits of the return register
; are meaningful.
;-----------------------------------------------------------------------
global sym(vp8_get_mb_ss_sse2) PRIVATE
sym(vp8_get_mb_ss_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog


    mov         rax, arg(0)             ;[src_ptr]
    mov         rcx, 8                  ; 8 groups of 32 shorts
    pxor        xmm4, xmm4              ; xmm4 = 4 x dword partial sums

.NEXTROW:
    movdqa      xmm0, [rax]
    movdqa      xmm1, [rax+16]
    movdqa      xmm2, [rax+32]
    movdqa      xmm3, [rax+48]
    pmaddwd     xmm0, xmm0              ; square and pair-wise add
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    pmaddwd     xmm3, xmm3

    paddd       xmm0, xmm1
    paddd       xmm2, xmm3
    paddd       xmm4, xmm0
    paddd       xmm4, xmm2

    add         rax, 0x40
    dec         rcx
    ; dec does not update CF, so the previous "ja" was testing the carry
    ; left behind by the pointer add above; jnz tests only what dec sets.
    jnz         .NEXTROW

    ; horizontal add of the four dword lanes
    movdqa      xmm3, xmm4
    psrldq      xmm4, 8
    paddd       xmm4, xmm3
    movdqa      xmm3, xmm4
    psrldq      xmm4, 4
    paddd       xmm4, xmm3
    movq        rax, xmm4               ; low dword = total


    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;unsigned int vp8_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; For a 16x16 block writes
;   *Sum = sum(src - ref)   and   *SSE = sum((src - ref)^2).
; Rows are read with unaligned loads.  The routine does not set up a
; meaningful return value in rax (it holds the Sum pointer on exit).
;-----------------------------------------------------------------------
global sym(vp8_get16x16var_sse2) PRIVATE
sym(vp8_get16x16var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]

    ; Prefetch the first 8 rows of both blocks
    lea         rcx, [rax+rax*2]
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax*2]
    prefetcht0  [rsi+rcx]
    lea         rbx, [rsi+rax*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax*2]
    prefetcht0  [rbx+rcx]

    lea         rcx, [rdx+rdx*2]
    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx*2]
    prefetcht0  [rdi+rcx]
    lea         rbx, [rdi+rdx*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx*2]
    prefetcht0  [rbx+rcx]

    pxor        xmm0, xmm0              ; clear xmm0 for unpack
    pxor        xmm7, xmm7              ; clear xmm7 for accumulating diffs

    pxor        xmm6, xmm6              ; clear xmm6 for accumulating sse
    mov         rcx, 16                 ; row counter

.var16loop:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rdi]

    prefetcht0  [rsi+rax*8]             ; stay 8 rows ahead
    prefetcht0  [rdi+rdx*8]

    movdqa      xmm3, xmm1
    movdqa      xmm4, xmm2


    punpcklbw   xmm1, xmm0              ; src pixels 0..7  -> words
    punpckhbw   xmm3, xmm0              ; src pixels 8..15 -> words

    punpcklbw   xmm2, xmm0              ; ref pixels 0..7  -> words
    punpckhbw   xmm4, xmm0              ; ref pixels 8..15 -> words


    psubw       xmm1, xmm2
    psubw       xmm3, xmm4

    paddw       xmm7, xmm1
    pmaddwd     xmm1, xmm1

    paddw       xmm7, xmm3
    pmaddwd     xmm3, xmm3

    paddd       xmm6, xmm1
    paddd       xmm6, xmm3

    add         rsi, rax
    add         rdi, rdx

    sub         rcx, 1
    jnz         .var16loop


    ; xmm1 = squares; xmm6 = word sums sign-extended to dwords
    movdqa      xmm1, xmm6
    pxor        xmm6, xmm6

    pxor        xmm5, xmm5
    punpcklwd   xmm6, xmm7              ; words into the high half of dwords

    punpckhwd   xmm5, xmm7
    psrad       xmm5, 16                ; arithmetic shift = sign extension

    psrad       xmm6, 16
    paddd       xmm6, xmm5

    movdqa      xmm2, xmm1
    punpckldq   xmm1, xmm0

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddd       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddd       xmm7, xmm6              ; low dword = Sum
    paddd       xmm1, xmm2              ; low dword = SSE

    mov         rax, arg(5)             ;[Sum]
    mov         rdi, arg(4)             ;[SSE]

    movd        DWORD PTR [rax], xmm7
    movd        DWORD PTR [rdi], xmm1


    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret




;-----------------------------------------------------------------------
;unsigned int vp8_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; For an 8x8 block writes
;   *Sum = sum(src - ref)   and   *SSE = sum((src - ref)^2).
; The eight rows are fully unrolled.  The sum is kept in 16-bit lanes all
; the way through the reduction and sign-extended from 16 bits when it is
; stored.  The routine does not set up a meaningful return value in rax.
;-----------------------------------------------------------------------
global sym(vp8_get8x8var_sse2) PRIVATE
sym(vp8_get8x8var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

    mov         rsi, arg(0)             ;[src_ptr]
    mov         rdi, arg(2)             ;[ref_ptr]

    movsxd      rax, DWORD PTR arg(1)   ;[source_stride]
    movsxd      rdx, DWORD PTR arg(3)   ;[recon_stride]

    pxor        xmm0, xmm0              ; clear xmm0 for unpack
    pxor        xmm7, xmm7              ; clear xmm7 for accumulating diffs

    ; row 0 (xmm1 becomes the SSE accumulator)
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rdi]

    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0

    psubsw      xmm1, xmm2
    paddw       xmm7, xmm1

    pmaddwd     xmm1, xmm1

    ; row 1
    movq        xmm2, QWORD PTR [rsi + rax]
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2


    ; row 2
    movq        xmm2, QWORD PTR [rsi + rax * 2]
    movq        xmm3, QWORD PTR [rdi + rdx * 2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2


    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    ; row 3
    movq        xmm2, QWORD PTR [rsi + rax]
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; row 4
    movq        xmm2, QWORD PTR [rsi + rax * 2]
    movq        xmm3, QWORD PTR [rdi + rdx * 2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2


    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]


    ; row 5
    movq        xmm2, QWORD PTR [rsi + rax]
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2

    ; row 6
    movq        xmm2, QWORD PTR [rsi + rax * 2]
    movq        xmm3, QWORD PTR [rdi + rdx * 2]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2


    lea         rsi, [rsi + rax * 2]
    lea         rdi, [rdi + rdx * 2]

    ; row 7
    movq        xmm2, QWORD PTR [rsi + rax]
    movq        xmm3, QWORD PTR [rdi + rdx]

    punpcklbw   xmm2, xmm0
    punpcklbw   xmm3, xmm0

    psubsw      xmm2, xmm3
    paddw       xmm7, xmm2

    pmaddwd     xmm2, xmm2
    paddd       xmm1, xmm2


    ; Reduce: xmm7 (word sums) -> word 0 of xmm7, xmm1 (squares) -> dword 0.
    ; The sum stays 16 bits wide (paddw); only its low word is used below.
    movdqa      xmm6, xmm7
    punpcklwd   xmm6, xmm0

    punpckhwd   xmm7, xmm0
    movdqa      xmm2, xmm1

    paddw       xmm6, xmm7
    punpckldq   xmm1, xmm0

    punpckhdq   xmm2, xmm0
    movdqa      xmm7, xmm6

    paddd       xmm1, xmm2
    punpckldq   xmm6, xmm0

    punpckhdq   xmm7, xmm0
    paddw       xmm6, xmm7

    movdqa      xmm2, xmm1
    movdqa      xmm7, xmm6

    psrldq      xmm1, 8
    psrldq      xmm6, 8

    paddw       xmm7, xmm6
    paddd       xmm1, xmm2

    mov         rax, arg(5)             ;[Sum]
    mov         rdi, arg(4)             ;[SSE]

    movq        rdx, xmm7
    movsx       rcx, dx                 ; sign-extend the 16-bit total

    mov         dword ptr [rax], ecx
    movd        DWORD PTR [rdi], xmm1

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_filter_block2d_bil_var_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int  xoffset,
;    int  yoffset,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 8-pixel-wide bilinear sub-pixel variance.  ref is filtered horizontally
; with the taps selected by xoffset and vertically with the taps selected
; by yoffset (each offset indexes a 32-byte entry of
; vp8_bilinear_filters_sse2), then compared against src over Height rows:
;   *sum        = sum(filtered_ref - src)
;   *sumsquared = sum((filtered_ref - src)^2)
; A zero offset skips the corresponding filter pass.
;
; Registers in the loops: rsi = ref, rdi = src, rcx = rows remaining,
; xmm4 = rounding constant (64 per lane).
;-----------------------------------------------------------------------
global sym(vp8_filter_block2d_bil_var_sse2) PRIVATE
sym(vp8_filter_block2d_bil_var_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    pxor        xmm6, xmm6              ; sum accumulator
    pxor        xmm7, xmm7              ; sse accumulator

    lea         rsi, [GLOBAL(xmm_bi_rd)] ; rounding
    movdqa      xmm4, XMMWORD PTR [rsi]

    lea         rcx, [GLOBAL(vp8_bilinear_filters_sse2)]
    movsxd      rax, dword ptr arg(5)   ; xoffset

    cmp         rax, 0                  ; skip first_pass filter if xoffset=0
    je          filter_block2d_bil_var_sse2_sp_only

    shl         rax, 5                  ; point to filter coeff with xoffset
    lea         rax, [rax + rcx]        ; HFilter

    movsxd      rdx, dword ptr arg(6)   ; yoffset

    cmp         rdx, 0                  ; skip second_pass filter if yoffset=0
    je          filter_block2d_bil_var_sse2_fp_only

    shl         rdx, 5
    lea         rdx, [rdx + rcx]        ; VFilter

    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height

    ; Horizontally filter row 0; it is the "previous row" for the first
    ; vertical pass and is kept in xmm5.
    pxor        xmm0, xmm0
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax+16]

    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift
    movdqa      xmm5, xmm1

    movsxd      rbx, dword ptr arg(1)   ;ref_pixels_per_line
    lea         rsi, [rsi + rbx]
%if ABI_IS_32BIT=0
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

filter_block2d_bil_var_sse2_loop:
    ; horizontal pass on the current row
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax+16]

    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movdqa      xmm3, xmm5              ; xmm3 = previous filtered row
    movdqa      xmm5, xmm1              ; remember this row for the next pass

    ; vertical pass between previous and current filtered rows
    pmullw      xmm3, [rdx]
    pmullw      xmm1, [rdx+16]
    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    lea         rsi, [rsi + rbx]        ;ref_pixels_per_line
%if ABI_IS_32BIT
    add         rdi, dword ptr arg(3)   ;src_pixels_per_line
%else
    lea         rdi, [rdi + r9]
%endif

    sub         rcx, 1
    jnz         filter_block2d_bil_var_sse2_loop

    jmp         filter_block2d_bil_variance

filter_block2d_bil_var_sse2_sp_only:
    ; xoffset == 0: vertical (second pass) filter only
    movsxd      rdx, dword ptr arg(6)   ; yoffset

    cmp         rdx, 0                  ; skip all if both xoffset=0 and yoffset=0
    je          filter_block2d_bil_var_sse2_full_pixel

    shl         rdx, 5
    lea         rdx, [rdx + rcx]        ; VFilter

    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line

    pxor        xmm0, xmm0
    movq        xmm1, QWORD PTR [rsi]   ; xmm1 = row 0 as words
    punpcklbw   xmm1, xmm0

    movsxd      rbx, dword ptr arg(3)   ;src_pixels_per_line
    lea         rsi, [rsi + rax]

filter_block2d_bil_sp_only_loop:
    movq        xmm3, QWORD PTR [rsi]
    punpcklbw   xmm3, xmm0
    movdqa      xmm5, xmm3              ; keep unfiltered row for next iteration

    pmullw      xmm1, [rdx]
    pmullw      xmm3, [rdx+16]
    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    movdqa      xmm1, xmm5
    lea         rsi, [rsi + rax]        ;ref_pixels_per_line
    lea         rdi, [rdi + rbx]        ;src_pixels_per_line

    sub         rcx, 1
    jnz         filter_block2d_bil_sp_only_loop

    jmp         filter_block2d_bil_variance

filter_block2d_bil_var_sse2_full_pixel:
    ; xoffset == 0 and yoffset == 0: no filtering at all
    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rbx, dword ptr arg(3)   ;src_pixels_per_line
    pxor        xmm0, xmm0

filter_block2d_bil_full_pixel_loop:
    movq        xmm1, QWORD PTR [rsi]
    punpcklbw   xmm1, xmm0

    movq        xmm2, QWORD PTR [rdi]
    punpcklbw   xmm2, xmm0

    psubw       xmm1, xmm2
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1

    lea         rsi, [rsi + rax]        ;ref_pixels_per_line
    lea         rdi, [rdi + rbx]        ;src_pixels_per_line

    sub         rcx, 1
    jnz         filter_block2d_bil_full_pixel_loop

    jmp         filter_block2d_bil_variance

filter_block2d_bil_var_sse2_fp_only:
    ; yoffset == 0: horizontal (first pass) filter only; rax = HFilter
    mov         rsi, arg(0)             ;ref_ptr
    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rdx, dword ptr arg(1)   ;ref_pixels_per_line

    pxor        xmm0, xmm0
    movsxd      rbx, dword ptr arg(3)   ;src_pixels_per_line

filter_block2d_bil_fp_only_loop:
    movq        xmm1, QWORD PTR [rsi]
    movq        xmm3, QWORD PTR [rsi+1]

    punpcklbw   xmm1, xmm0
    pmullw      xmm1, [rax]
    punpcklbw   xmm3, xmm0
    pmullw      xmm3, [rax+16]

    paddw       xmm1, xmm3
    paddw       xmm1, xmm4
    psraw       xmm1, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0

    psubw       xmm1, xmm3
    paddw       xmm6, xmm1

    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm1
    lea         rsi, [rsi + rdx]
    lea         rdi, [rdi + rbx]        ;src_pixels_per_line

    sub         rcx, 1
    jnz         filter_block2d_bil_fp_only_loop

    jmp         filter_block2d_bil_variance

filter_block2d_bil_variance:
    ; Horizontal reduction done entirely in XMM registers (same sequence
    ; as the 16-wide kernels in this file).  The earlier version moved the
    ; accumulators into MMX registers with movdq2q and returned without
    ; EMMS, leaving the x87/MMX state dirty for the caller.
    pxor        xmm0, xmm0
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    punpcklwd   xmm0, xmm6              ; word sums -> high half of dwords
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16                ; sign-extend to 32 bits
    psrad       xmm1, 16
    paddd       xmm0, xmm1
    movdqa      xmm1, xmm0

    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7              ; low dword = sum of squares
    paddd       xmm0, xmm1              ; low dword = sum

    mov         rsi, arg(7)             ; sum
    mov         rdi, arg(8)             ; sumsquared

    movd        [rsi], xmm0             ; xsum
    movd        [rdi], xmm6             ; xxsum

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp8_half_horiz_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 8-pixel-wide variance against ref averaged (pavgb) with its right
; neighbour and then with the row below, over Height rows.
;   *sum = sum(avg - src),  *sumsquared = sum((avg - src)^2)
;-----------------------------------------------------------------------
global sym(vp8_half_horiz_vert_variance8x_h_sse2) PRIVATE
sym(vp8_half_horiz_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height

    pxor        xmm0, xmm0

    movq        xmm5, QWORD PTR [rsi]   ; xmm5 = s0,s1,s2..s7
    movq        xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8
    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3) horizontal line 1

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)   ;ref_pixels_per_line    ; next source
%else
    add         rsi, r8
%endif

vp8_half_horiz_vert_variance8x_h_1:

    movq        xmm1, QWORD PTR [rsi]
    movq        xmm2, QWORD PTR [rsi+1]
    pavgb       xmm1, xmm2              ; xmm1 = avg(xmm1,xmm2) horizontal line i+1

    pavgb       xmm5, xmm1              ; xmm5 = vertical average of the above
    punpcklbw   xmm5, xmm0              ; xmm5 = words of above

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above

    psubw       xmm5, xmm3              ; xmm5 -= xmm3
    paddw       xmm6, xmm5              ; xmm6 += accumulated column differences
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    paddd       xmm7, xmm5              ; xmm7 += accumulated square column differences

    movdqa      xmm5, xmm1              ; save xmm1 for use on the next row

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)   ;ref_pixels_per_line    ; next source
    add         edi, dword ptr arg(3)   ;src_pixels_per_line    ; next destination
%else
    add         rsi, r8
    add         rdi, r9
%endif

    sub         rcx, 1
    jnz         vp8_half_horiz_vert_variance8x_h_1

    ; Horizontal reduction in XMM registers only (no MMX, so no EMMS is
    ; needed by the caller).
    pxor        xmm0, xmm0
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    punpcklwd   xmm0, xmm6              ; word sums -> high half of dwords
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16                ; sign-extend to 32 bits
    psrad       xmm1, 16
    paddd       xmm0, xmm1
    movdqa      xmm1, xmm0

    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7              ; low dword = sum of squares
    paddd       xmm0, xmm1              ; low dword = sum

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], xmm0
    movd        [rdi], xmm6


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_half_horiz_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 16-pixel-wide version of vp8_half_horiz_vert_variance8x_h_sse2.
;-----------------------------------------------------------------------
global sym(vp8_half_horiz_vert_variance16x_h_sse2) PRIVATE
sym(vp8_half_horiz_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)   ;src_pixels_per_line

    pxor        xmm0, xmm0

    movdqu      xmm5, XMMWORD PTR [rsi]
    movdqu      xmm3, XMMWORD PTR [rsi+1]
    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3) horizontal line 1

    lea         rsi, [rsi + rax]

vp8_half_horiz_vert_variance16x_h_1:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rsi+1]
    pavgb       xmm1, xmm2              ; xmm1 = avg(xmm1,xmm2) horizontal line i+1

    pavgb       xmm5, xmm1              ; xmm5 = vertical average of the above

    movdqa      xmm4, xmm5
    punpcklbw   xmm5, xmm0              ; xmm5 = words of pixels 0..7
    punpckhbw   xmm4, xmm0              ; xmm4 = words of pixels 8..15

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above
    psubw       xmm5, xmm3              ; xmm5 -= xmm3

    movq        xmm3, QWORD PTR [rdi+8] ; xmm3 = d8..d15
    punpcklbw   xmm3, xmm0
    psubw       xmm4, xmm3

    paddw       xmm6, xmm5              ; xmm6 += accumulated column differences
    paddw       xmm6, xmm4
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    pmaddwd     xmm4, xmm4
    paddd       xmm7, xmm5              ; xmm7 += accumulated square column differences
    paddd       xmm7, xmm4

    movdqa      xmm5, xmm1              ; save xmm1 for use on the next row

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]

    sub         rcx, 1
    jnz         vp8_half_horiz_vert_variance16x_h_1

    ; horizontal reduction (xmm0 is still zero here)
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1
    movdqa      xmm1, xmm0

    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7
    paddd       xmm0, xmm1

    mov         rsi, arg(5)             ;[Sum]
    mov         rdi, arg(6)             ;[SSE]

    movd        [rsi], xmm0
    movd        [rdi], xmm6

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp8_half_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 8-pixel-wide variance against ref averaged (pavgb) with the row below,
; over Height rows.
;   *sum = sum(avg - src),  *sumsquared = sum((avg - src)^2)
;-----------------------------------------------------------------------
global sym(vp8_half_vert_variance8x_h_sse2) PRIVATE
sym(vp8_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line

    pxor        xmm0, xmm0
vp8_half_vert_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]     ; xmm5 = row i,   pixels 0..7
    movq        xmm3, QWORD PTR [rsi+rax] ; xmm3 = row i+1, pixels 0..7

    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3)
    punpcklbw   xmm5, xmm0              ; xmm5 = words of above

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above

    psubw       xmm5, xmm3              ; xmm5 -= xmm3
    paddw       xmm6, xmm5              ; xmm6 += accumulated column differences
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    paddd       xmm7, xmm5              ; xmm7 += accumulated square column differences

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)   ;ref_pixels_per_line    ; next source
    add         edi, dword ptr arg(3)   ;src_pixels_per_line    ; next destination
%else
    add         rsi, r8
    add         rdi, r9
%endif

    sub         rcx, 1
    jnz         vp8_half_vert_variance8x_h_1

    ; Horizontal reduction in XMM registers only (no MMX, so no EMMS is
    ; needed by the caller).
    pxor        xmm0, xmm0
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    punpcklwd   xmm0, xmm6              ; word sums -> high half of dwords
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16                ; sign-extend to 32 bits
    psrad       xmm1, 16
    paddd       xmm0, xmm1
    movdqa      xmm1, xmm0

    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7              ; low dword = sum of squares
    paddd       xmm0, xmm1              ; low dword = sum

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], xmm0
    movd        [rdi], xmm6


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_half_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 16-pixel-wide version of vp8_half_vert_variance8x_h_sse2.  The previous
; ref row is carried in xmm5 so each row is loaded only once.
;-----------------------------------------------------------------------
global sym(vp8_half_vert_variance16x_h_sse2) PRIVATE
sym(vp8_half_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)   ;src_pixels_per_line

    movdqu      xmm5, XMMWORD PTR [rsi] ; xmm5 = row 0
    lea         rsi, [rsi + rax]
    pxor        xmm0, xmm0

vp8_half_vert_variance16x_h_1:
    movdqu      xmm3, XMMWORD PTR [rsi] ; xmm3 = row i+1

    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3)
    movdqa      xmm4, xmm5
    punpcklbw   xmm5, xmm0              ; pixels 0..7  as words
    punpckhbw   xmm4, xmm0              ; pixels 8..15 as words

    movq        xmm2, QWORD PTR [rdi]
    punpcklbw   xmm2, xmm0
    psubw       xmm5, xmm2
    movq        xmm2, QWORD PTR [rdi+8]
    punpcklbw   xmm2, xmm0
    psubw       xmm4, xmm2

    paddw       xmm6, xmm5              ; xmm6 += accumulated column differences
    paddw       xmm6, xmm4
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    pmaddwd     xmm4, xmm4
    paddd       xmm7, xmm5              ; xmm7 += accumulated square column differences
    paddd       xmm7, xmm4

    movdqa      xmm5, xmm3              ; this row is the upper row next time

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]

    sub         rcx, 1
    jnz         vp8_half_vert_variance16x_h_1

    ; horizontal reduction (xmm0 is still zero here)
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1
    movdqa      xmm1, xmm0

    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7
    paddd       xmm0, xmm1

    mov         rsi, arg(5)             ;[Sum]
    mov         rdi, arg(6)             ;[SSE]

    movd        [rsi], xmm0
    movd        [rdi], xmm6

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp8_half_horiz_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 8-pixel-wide variance against ref averaged (pavgb) with its right
; neighbour, over Height rows.
;   *sum = sum(avg - src),  *sumsquared = sum((avg - src)^2)
;-----------------------------------------------------------------------
global sym(vp8_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp8_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
%endif

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height

    pxor        xmm0, xmm0
vp8_half_horiz_variance8x_h_1:
    movq        xmm5, QWORD PTR [rsi]   ; xmm5 = s0,s1,s2..s7
    movq        xmm3, QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8

    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3)
    punpcklbw   xmm5, xmm0              ; xmm5 = words of above

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above

    psubw       xmm5, xmm3              ; xmm5 -= xmm3
    paddw       xmm6, xmm5              ; xmm6 += accumulated column differences
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    paddd       xmm7, xmm5              ; xmm7 += accumulated square column differences

%if ABI_IS_32BIT
    add         esi, dword ptr arg(1)   ;ref_pixels_per_line    ; next source
    add         edi, dword ptr arg(3)   ;src_pixels_per_line    ; next destination
%else
    add         rsi, r8
    add         rdi, r9
%endif
    sub         rcx, 1
    jnz         vp8_half_horiz_variance8x_h_1

    ; Horizontal reduction in XMM registers only (no MMX, so no EMMS is
    ; needed by the caller).
    pxor        xmm0, xmm0
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    punpcklwd   xmm0, xmm6              ; word sums -> high half of dwords
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16                ; sign-extend to 32 bits
    psrad       xmm1, 16
    paddd       xmm0, xmm1
    movdqa      xmm1, xmm0

    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7              ; low dword = sum of squares
    paddd       xmm0, xmm1              ; low dword = sum

    mov         rsi, arg(5)             ; sum
    mov         rdi, arg(6)             ; sumsquared

    movd        [rsi], xmm0
    movd        [rdi], xmm6


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_half_horiz_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 16-pixel-wide version of vp8_half_horiz_variance8x_h_sse2.
;-----------------------------------------------------------------------
global sym(vp8_half_horiz_variance16x_h_sse2) PRIVATE
sym(vp8_half_horiz_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6              ; error accumulator
    pxor        xmm7, xmm7              ; sse accumulator
    mov         rsi, arg(0)             ;ref_ptr

    mov         rdi, arg(2)             ;src_ptr
    movsxd      rcx, dword ptr arg(4)   ;Height
    movsxd      rax, dword ptr arg(1)   ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)   ;src_pixels_per_line

    pxor        xmm0, xmm0

vp8_half_horiz_variance16x_h_1:
    movdqu      xmm5, XMMWORD PTR [rsi]   ; xmm5 = s0,s1,s2..s15
    movdqu      xmm3, XMMWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s16

    pavgb       xmm5, xmm3              ; xmm5 = avg(xmm5,xmm3)
    movdqa      xmm1, xmm5
    punpcklbw   xmm5, xmm0              ; xmm5 = words of pixels 0..7
    punpckhbw   xmm1, xmm0              ; xmm1 = words of pixels 8..15

    movq        xmm3, QWORD PTR [rdi]   ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3, xmm0              ; xmm3 = words of above
    movq        xmm2, QWORD PTR [rdi+8] ; xmm2 = d8..d15
    punpcklbw   xmm2, xmm0

    psubw       xmm5, xmm3              ; xmm5 -= xmm3
    psubw       xmm1, xmm2
    paddw       xmm6, xmm5              ; xmm6 += accumulated column differences
    paddw       xmm6, xmm1
    pmaddwd     xmm5, xmm5              ; xmm5 *= xmm5
    pmaddwd     xmm1, xmm1
    paddd       xmm7, xmm5              ; xmm7 += accumulated square column differences
    paddd       xmm7, xmm1

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]

    sub         rcx, 1
    jnz         vp8_half_horiz_variance16x_h_1

    ; horizontal reduction (xmm0 is still zero here)
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1
    movdqa      xmm1, xmm0

    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7
    paddd       xmm0, xmm1

    mov         rsi, arg(5)             ;[Sum]
    mov         rdi, arg(6)             ;[SSE]

    movd        [rsi], xmm0
    movd        [rdi], xmm6

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; Rounding term added before the >> xmm_filter_shift (64 = 1 << 6).
;    short xmm_bi_rd[8] = { 64, 64, 64, 64,64, 64, 64, 64};
align 16
xmm_bi_rd:
    times 8 dw 64
; Bilinear filter taps, one 32-byte entry per offset 0..7: eight copies of
; the first tap followed by eight copies of the second tap.  Each pair of
; taps sums to 128 (= 1 << xmm_filter_shift).
align 16
vp8_bilinear_filters_sse2:
    dw 128, 128, 128, 128, 128, 128, 128, 128,  0,  0,  0,  0,  0,  0,  0,  0
    dw 112, 112, 112, 112, 112, 112, 112, 112, 16, 16, 16, 16, 16, 16, 16, 16
    dw 96, 96, 96, 96, 96, 96, 96, 96, 32, 32, 32, 32, 32, 32, 32, 32
    dw 80, 80, 80, 80, 80, 80, 80, 80, 48, 48, 48, 48, 48, 48, 48, 48
    dw 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64, 64
    dw 48, 48, 48, 48, 48, 48, 48, 48, 80, 80, 80, 80, 80, 80, 80, 80
    dw 32, 32, 32, 32, 32, 32, 32, 32, 96, 96, 96, 96, 96, 96, 96, 96
    dw 16, 16, 16, 16, 16, 16, 16, 16, 112, 112, 112, 112, 112, 112, 112, 112