;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; SSE2 variance helpers (NASM/yasm syntax).
;
; The ABI glue used below -- sym(), arg(), PRIVATE, SHADOW_ARGS_TO_STACK,
; UNSHADOW_ARGS, SAVE_XMM, RESTORE_XMM, GET_GOT, RESTORE_GOT and
; ABI_IS_32BIT -- comes from the include and is not defined in this file.
; NOTE(review): the 32-bit paths below assume that include maps the r*
; register names onto their e* counterparts -- confirm against the include.

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; VAR8_ACCUM_ROW src_addr, ref_addr
;
; Accumulates one 8-pixel row for vp9_get8x8var_sse2.
;   In:    xmm0 = 0 (used to widen bytes to words)
;   InOut: xmm7 += 8 word differences (src - ref)
;          xmm1 += 4 dword sums of squared differences
;   Clobb: xmm2, xmm3
;-----------------------------------------------------------------------
%macro VAR8_ACCUM_ROW 2
    movq        xmm2,       QWORD PTR [%1]
    movq        xmm3,       QWORD PTR [%2]

    punpcklbw   xmm2,       xmm0            ; bytes -> words
    punpcklbw   xmm3,       xmm0

    psubsw      xmm2,       xmm3            ; src - ref
    paddw       xmm7,       xmm2            ; running sum of differences

    pmaddwd     xmm2,       xmm2            ; squares, pairwise added to dwords
    paddd       xmm1,       xmm2            ; running sum of squares
%endmacro

;-----------------------------------------------------------------------
; HALF_VARIANCE8X_H_STORE
;
; Shared tail of the vp9_half_*_variance8x_h_sse2 routines: reduces the
; two vector accumulators and stores the scalar results.
;   In:    xmm6 = 8 word sums of differences
;          xmm7 = 4 dword sums of squared differences
;          arg(5) = int *sum, arg(6) = unsigned int *sumsquared
;   Out:   *sum        = total of the xmm6 words, computed modulo 2^16 and
;                        sign-extended to 32 bits
;          *sumsquared = total of the xmm7 dwords (low 32 bits)
;   Clobb: rsi, rdi, xmm6, xmm7, mm2, mm3, mm4, mm6, mm7
;
; Ends with emms: the MMX registers alias the x87 register stack, so the
; tag word must be cleared before control returns to code that may use x87.
;-----------------------------------------------------------------------
%macro HALF_VARIANCE8X_H_STORE 0
    movdq2q     mm6,        xmm6            ; low 4 difference words
    movdq2q     mm7,        xmm7            ; low 2 sse dwords

    psrldq      xmm6,       8
    psrldq      xmm7,       8

    movdq2q     mm2,        xmm6            ; high 4 difference words
    movdq2q     mm3,        xmm7            ; high 2 sse dwords

    paddw       mm6,        mm2             ; 4 word partial sums
    paddd       mm7,        mm3             ; 2 dword partial sums

    pxor        mm3,        mm3
    pxor        mm2,        mm2

    punpcklwd   mm2,        mm6             ; words into the HIGH half of each
    punpckhwd   mm3,        mm6             ; dword so psrad can sign-extend

    paddd       mm2,        mm3
    movq        mm6,        mm2

    psrlq       mm6,        32
    paddd       mm2,        mm6             ; low dword = (sum of words) << 16

    psrad       mm2,        16              ; sign-extend the 16-bit total
    movq        mm4,        mm7

    psrlq       mm4,        32
    paddd       mm4,        mm7             ; low dword = total sse

    mov         rsi,        arg(5)          ; sum
    mov         rdi,        arg(6)          ; sumsquared

    movd        [rsi],      mm2
    movd        [rdi],      mm4

    emms                                    ; leave the x87/MMX state clean
%endmacro


;-----------------------------------------------------------------------
;unsigned int vp9_get_mb_ss_sse2
;(
;    short *src_ptr
;)
;
; Returns the sum of squares of 256 consecutive 16-bit values
; (8 iterations x 64 bytes) starting at src_ptr.
;   In:    arg(0) = src_ptr; read with movdqa, so it must be 16-byte aligned
;   Out:   low 32 bits of rax = sum of squares (the upper bits of rax hold
;          a leftover partial sum and are not part of the result)
;   Clobb: rcx, xmm0-xmm4, flags
;-----------------------------------------------------------------------
global sym(vp9_get_mb_ss_sse2) PRIVATE
sym(vp9_get_mb_ss_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 1
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp,        16
    ; end prolog


    mov         rax,        arg(0)          ;[src_ptr]
    mov         rcx,        8               ; 8 blocks of 32 words
    pxor        xmm4,       xmm4            ; xmm4 = 4 dword accumulators

.NEXTROW:
    movdqa      xmm0,       [rax]
    movdqa      xmm1,       [rax+16]
    movdqa      xmm2,       [rax+32]
    movdqa      xmm3,       [rax+48]
    pmaddwd     xmm0,       xmm0            ; square and pairwise add
    pmaddwd     xmm1,       xmm1
    pmaddwd     xmm2,       xmm2
    pmaddwd     xmm3,       xmm3

    paddd       xmm0,       xmm1
    paddd       xmm2,       xmm3
    paddd       xmm4,       xmm0
    paddd       xmm4,       xmm2

    add         rax,        0x40
    dec         rcx
    ; dec leaves CF untouched, so branch on ZF only.  The previous "ja" also
    ; tested the carry produced by the pointer add above.
    jnz         .NEXTROW

    ; horizontal add of the four dwords into the low dword
    movdqa      xmm3,       xmm4
    psrldq      xmm4,       8
    paddd       xmm4,       xmm3
    movdqa      xmm3,       xmm4
    psrldq      xmm4,       4
    paddd       xmm4,       xmm3
    movq        rax,        xmm4


    ; begin epilog
    add         rsp,        16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;unsigned int vp9_get16x16var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; For a 16x16 block of bytes computes
;   *Sum = sum(src - ref)   and   *SSE = sum((src - ref)^2).
; Rows are loaded with movdqu, so no alignment is required.
;   Out:   results are written through SSE and Sum.  rax is not set to a
;          meaningful return value (it holds the Sum pointer at ret).
;   Clobb: rax, rcx, rdx, xmm0-xmm7, flags
;          (rbx/rsi/rdi are pushed and popped here; xmm handling is left to
;           SAVE_XMM / RESTORE_XMM)
;-----------------------------------------------------------------------
global sym(vp9_get16x16var_sse2) PRIVATE
sym(vp9_get16x16var_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ;[src_ptr]
    mov         rdi,        arg(2)          ;[ref_ptr]

    movsxd      rax,        DWORD PTR arg(1) ;[source_stride]
    movsxd      rdx,        DWORD PTR arg(3) ;[recon_stride]

    ; Prefetch the first 8 rows of the source block
    lea         rcx,        [rax+rax*2]     ; rcx = 3 * source_stride
    prefetcht0  [rsi]
    prefetcht0  [rsi+rax]
    prefetcht0  [rsi+rax*2]
    prefetcht0  [rsi+rcx]
    lea         rbx,        [rsi+rax*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rax]
    prefetcht0  [rbx+rax*2]
    prefetcht0  [rbx+rcx]

    ; Prefetch the first 8 rows of the reference block
    lea         rcx,        [rdx+rdx*2]     ; rcx = 3 * recon_stride
    prefetcht0  [rdi]
    prefetcht0  [rdi+rdx]
    prefetcht0  [rdi+rdx*2]
    prefetcht0  [rdi+rcx]
    lea         rbx,        [rdi+rdx*4]
    prefetcht0  [rbx]
    prefetcht0  [rbx+rdx]
    prefetcht0  [rbx+rdx*2]
    prefetcht0  [rbx+rcx]

    pxor        xmm0,       xmm0            ; clear xmm0 for unpack
    pxor        xmm7,       xmm7            ; clear xmm7 for accumulating diffs

    pxor        xmm6,       xmm6            ; clear xmm6 for accumulating sse
    mov         rcx,        16              ; row counter

.var16loop:
    movdqu      xmm1,       XMMWORD PTR [rsi]
    movdqu      xmm2,       XMMWORD PTR [rdi]

    prefetcht0  [rsi+rax*8]                 ; row 8 strides ahead
    prefetcht0  [rdi+rdx*8]

    movdqa      xmm3,       xmm1
    movdqa      xmm4,       xmm2


    punpcklbw   xmm1,       xmm0            ; src pixels 0-7  -> words
    punpckhbw   xmm3,       xmm0            ; src pixels 8-15 -> words

    punpcklbw   xmm2,       xmm0            ; ref pixels 0-7  -> words
    punpckhbw   xmm4,       xmm0            ; ref pixels 8-15 -> words


    psubw       xmm1,       xmm2
    psubw       xmm3,       xmm4

    ; Each word lane of xmm7 receives 2 differences per row for 16 rows;
    ; 32 * 255 = 8160, so the word accumulators cannot overflow.
    paddw       xmm7,       xmm1
    pmaddwd     xmm1,       xmm1

    paddw       xmm7,       xmm3
    pmaddwd     xmm3,       xmm3

    paddd       xmm6,       xmm1
    paddd       xmm6,       xmm3

    add         rsi,        rax
    add         rdi,        rdx

    sub         rcx,        1
    jnz         .var16loop


    movdqa      xmm1,       xmm6            ; xmm1 = sse dwords
    pxor        xmm6,       xmm6

    ; Sign-extend the 8 difference words to dwords: place each word in the
    ; high half of a dword, then shift right arithmetically by 16.
    pxor        xmm5,       xmm5
    punpcklwd   xmm6,       xmm7

    punpckhwd   xmm5,       xmm7
    psrad       xmm5,       16

    psrad       xmm6,       16
    paddd       xmm6,       xmm5            ; xmm6 = 4 dword diff sums

    ; Horizontal add of both accumulators into their low dwords
    ; (xmm0 is still zero).
    movdqa      xmm2,       xmm1
    punpckldq   xmm1,       xmm0

    punpckhdq   xmm2,       xmm0
    movdqa      xmm7,       xmm6

    paddd       xmm1,       xmm2
    punpckldq   xmm6,       xmm0

    punpckhdq   xmm7,       xmm0
    paddd       xmm6,       xmm7

    movdqa      xmm2,       xmm1
    movdqa      xmm7,       xmm6

    psrldq      xmm1,       8
    psrldq      xmm6,       8

    paddd       xmm7,       xmm6            ; low dword = Sum
    paddd       xmm1,       xmm2            ; low dword = SSE

    mov         rax,        arg(5)          ;[Sum]
    mov         rdi,        arg(4)          ;[SSE]

    movd DWORD PTR [rax],   xmm7
    movd DWORD PTR [rdi],   xmm1


    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;unsigned int vp9_get8x8var_sse2
;(
;    unsigned char   *  src_ptr,
;    int             source_stride,
;    unsigned char   *  ref_ptr,
;    int             recon_stride,
;    unsigned int    *  SSE,
;    int             *  Sum
;)
;
; For an 8x8 block of bytes computes
;   *Sum = sum(src - ref)   and   *SSE = sum((src - ref)^2).
;   Out:   results are written through SSE and Sum.  rax is not set to a
;          meaningful return value (it holds the Sum pointer at ret).
;   Clobb: rax, rcx, rdx, xmm0-xmm3, xmm6, xmm7
;          (rsi/rdi are pushed and popped here; xmm handling is left to
;           SAVE_XMM / RESTORE_XMM)
;-----------------------------------------------------------------------
global sym(vp9_get8x8var_sse2) PRIVATE
sym(vp9_get8x8var_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp,        16
    ; end prolog

    mov         rsi,        arg(0)          ;[src_ptr]
    mov         rdi,        arg(2)          ;[ref_ptr]

    movsxd      rax,        DWORD PTR arg(1) ;[source_stride]
    movsxd      rdx,        DWORD PTR arg(3) ;[recon_stride]

    pxor        xmm0,       xmm0            ; clear xmm0 for unpack
    pxor        xmm7,       xmm7            ; clear xmm7 for accumulating diffs
    pxor        xmm1,       xmm1            ; clear xmm1 for accumulating sse

    ; rows 0-2
    VAR8_ACCUM_ROW rsi, rdi
    VAR8_ACCUM_ROW rsi + rax, rdi + rdx
    VAR8_ACCUM_ROW rsi + rax * 2, rdi + rdx * 2

    ; rows 3-4
    lea         rsi,        [rsi + rax * 2]
    lea         rdi,        [rdi + rdx * 2]
    VAR8_ACCUM_ROW rsi + rax, rdi + rdx
    VAR8_ACCUM_ROW rsi + rax * 2, rdi + rdx * 2

    ; rows 5-6
    lea         rsi,        [rsi + rax * 2]
    lea         rdi,        [rdi + rdx * 2]
    VAR8_ACCUM_ROW rsi + rax, rdi + rdx
    VAR8_ACCUM_ROW rsi + rax * 2, rdi + rdx * 2

    ; row 7
    lea         rsi,        [rsi + rax * 2]
    lea         rdi,        [rdi + rdx * 2]
    VAR8_ACCUM_ROW rsi + rax, rdi + rdx


    ; Reduce the 8 difference words.  The unpacks zero-extend and the adds
    ; are word adds, so the total is kept modulo 2^16 in the low word; it is
    ; sign-extended with movsx below (64 * 255 = 16320 fits in 16 bits).
    movdqa      xmm6,       xmm7
    punpcklwd   xmm6,       xmm0

    punpckhwd   xmm7,       xmm0
    movdqa      xmm2,       xmm1

    paddw       xmm6,       xmm7
    punpckldq   xmm1,       xmm0

    punpckhdq   xmm2,       xmm0
    movdqa      xmm7,       xmm6

    paddd       xmm1,       xmm2
    punpckldq   xmm6,       xmm0

    punpckhdq   xmm7,       xmm0
    paddw       xmm6,       xmm7

    movdqa      xmm2,       xmm1
    movdqa      xmm7,       xmm6

    psrldq      xmm1,       8
    psrldq      xmm6,       8

    paddw       xmm7,       xmm6            ; low word  = Sum (16-bit)
    paddd       xmm1,       xmm2            ; low dword = SSE

    mov         rax,        arg(5)          ;[Sum]
    mov         rdi,        arg(4)          ;[SSE]

    movq        rdx,        xmm7
    movsx       rcx,        dx              ; sign-extend the 16-bit sum

    mov  dword ptr [rax],   ecx
    movd DWORD PTR [rdi],   xmm1

    ; begin epilog
    add         rsp,        16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp9_half_horiz_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 8-pixel-wide block, Height rows.  The prediction for row i is
; avg(h(i), h(i+1)) where h(r) = avg(ref[r][x], ref[r][x+1]) (pavgb
; rounding); it is compared against src row i.  Reads Height+1 rows of
; 9 bytes from ref_ptr.
;   Out:   *sum = sum(pred - src), *sumsquared = sum((pred - src)^2);
;          both are 0 when Height is 0.
;   Clobb: rcx, r8, r9 (64-bit), xmm0-xmm3, xmm5-xmm7, MMX (emms issued)
;-----------------------------------------------------------------------
global sym(vp9_half_horiz_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(1) ;ref_pixels_per_line
    movsxd      r9,         dword ptr arg(3) ;src_pixels_per_line
%endif

    pxor        xmm6,       xmm6            ; error accumulator
    pxor        xmm7,       xmm7            ; sse accumulator
    mov         rsi,        arg(0)          ;ref_ptr

    mov         rdi,        arg(2)          ;src_ptr
    movsxd      rcx,        dword ptr arg(4) ;Height

    pxor        xmm0,       xmm0            ; zero for byte -> word unpack

    ; Height == 0: skip the loop (the counter would otherwise wrap) and
    ; store the zeroed accumulators.
    test        rcx,        rcx
    jz          .half_horiz_vert_variance8x_h_store

    movq        xmm5,       QWORD PTR [rsi]   ; xmm5 = s0,s1,s2..s7
    movq        xmm3,       QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8
    pavgb       xmm5,       xmm3            ; xmm5 = horizontal average, row 0

%if ABI_IS_32BIT
    add         rsi,        dword ptr arg(1) ;ref_pixels_per_line ; next source
%else
    add         rsi,        r8
%endif

.half_horiz_vert_variance8x_h_1:

    movq        xmm1,       QWORD PTR [rsi]
    movq        xmm2,       QWORD PTR [rsi+1]
    pavgb       xmm1,       xmm2            ; xmm1 = horizontal average, row i+1

    pavgb       xmm5,       xmm1            ; xmm5 = vertical average of rows i, i+1
    punpcklbw   xmm5,       xmm0            ; xmm5 = words of above

    movq        xmm3,       QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3,       xmm0            ; xmm3 = words of above

    psubw       xmm5,       xmm3            ; xmm5 -= xmm3
    paddw       xmm6,       xmm5            ; xmm6 += column differences
    pmaddwd     xmm5,       xmm5            ; xmm5 *= xmm5
    paddd       xmm7,       xmm5            ; xmm7 += squared column differences

    movdqa      xmm5,       xmm1            ; keep row i+1 for the next iteration

%if ABI_IS_32BIT
    add         esi,        dword ptr arg(1) ;ref_pixels_per_line ; next source
    add         edi,        dword ptr arg(3) ;src_pixels_per_line ; next destination
%else
    add         rsi,        r8
    add         rdi,        r9
%endif

    sub         rcx,        1
    jnz         .half_horiz_vert_variance8x_h_1

.half_horiz_vert_variance8x_h_store:
    HALF_VARIANCE8X_H_STORE


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp9_half_vert_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 8-pixel-wide block, Height rows.  The prediction for row i is
; avg(ref[i][x], ref[i+1][x]) (pavgb rounding); it is compared against
; src row i.  Reads Height+1 rows of 8 bytes from ref_ptr.
;   Out:   *sum = sum(pred - src), *sumsquared = sum((pred - src)^2);
;          both are 0 when Height is 0.
;   Clobb: rax, rcx, r8, r9 (64-bit), xmm0, xmm3, xmm5-xmm7,
;          MMX (emms issued)
;-----------------------------------------------------------------------
global sym(vp9_half_vert_variance8x_h_sse2) PRIVATE
sym(vp9_half_vert_variance8x_h_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(1) ;ref_pixels_per_line
    movsxd      r9,         dword ptr arg(3) ;src_pixels_per_line
%endif

    pxor        xmm6,       xmm6            ; error accumulator
    pxor        xmm7,       xmm7            ; sse accumulator
    mov         rsi,        arg(0)          ;ref_ptr

    mov         rdi,        arg(2)          ;src_ptr
    movsxd      rcx,        dword ptr arg(4) ;Height
    movsxd      rax,        dword ptr arg(1) ;ref_pixels_per_line

    pxor        xmm0,       xmm0            ; zero for byte -> word unpack

    ; Height == 0: skip the loop (the counter would otherwise wrap) and
    ; store the zeroed accumulators.
    test        rcx,        rcx
    jz          .half_vert_variance8x_h_store

.half_vert_variance8x_h_1:
    movq        xmm5,       QWORD PTR [rsi]     ; xmm5 = 8 pixels of row i
    movq        xmm3,       QWORD PTR [rsi+rax] ; xmm3 = 8 pixels of row i+1

    pavgb       xmm5,       xmm3            ; xmm5 = vertical average
    punpcklbw   xmm5,       xmm0            ; xmm5 = words of above

    movq        xmm3,       QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3,       xmm0            ; xmm3 = words of above

    psubw       xmm5,       xmm3            ; xmm5 -= xmm3
    paddw       xmm6,       xmm5            ; xmm6 += column differences
    pmaddwd     xmm5,       xmm5            ; xmm5 *= xmm5
    paddd       xmm7,       xmm5            ; xmm7 += squared column differences

%if ABI_IS_32BIT
    add         esi,        dword ptr arg(1) ;ref_pixels_per_line ; next source
    add         edi,        dword ptr arg(3) ;src_pixels_per_line ; next destination
%else
    add         rsi,        r8
    add         rdi,        r9
%endif

    sub         rcx,        1
    jnz         .half_vert_variance8x_h_1

.half_vert_variance8x_h_store:
    HALF_VARIANCE8X_H_STORE


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void vp9_half_horiz_variance8x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 8-pixel-wide block, Height rows.  The prediction for row i is
; avg(ref[i][x], ref[i][x+1]) (pavgb rounding); it is compared against
; src row i.  Reads Height rows of 9 bytes from ref_ptr.
;   Out:   *sum = sum(pred - src), *sumsquared = sum((pred - src)^2);
;          both are 0 when Height is 0.
;   Clobb: rcx, r8, r9 (64-bit), xmm0, xmm3, xmm5-xmm7, MMX (emms issued)
;-----------------------------------------------------------------------
global sym(vp9_half_horiz_variance8x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance8x_h_sse2):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=0
    movsxd      r8,         dword ptr arg(1) ;ref_pixels_per_line
    movsxd      r9,         dword ptr arg(3) ;src_pixels_per_line
%endif

    pxor        xmm6,       xmm6            ; error accumulator
    pxor        xmm7,       xmm7            ; sse accumulator
    mov         rsi,        arg(0)          ;ref_ptr

    mov         rdi,        arg(2)          ;src_ptr
    movsxd      rcx,        dword ptr arg(4) ;Height

    pxor        xmm0,       xmm0            ; zero for byte -> word unpack

    ; Height == 0: skip the loop (the counter would otherwise wrap) and
    ; store the zeroed accumulators.
    test        rcx,        rcx
    jz          .half_horiz_variance8x_h_store

.half_horiz_variance8x_h_1:
    movq        xmm5,       QWORD PTR [rsi]   ; xmm5 = s0,s1,s2..s7
    movq        xmm3,       QWORD PTR [rsi+1] ; xmm3 = s1,s2,s3..s8

    pavgb       xmm5,       xmm3            ; xmm5 = horizontal average
    punpcklbw   xmm5,       xmm0            ; xmm5 = words of above

    movq        xmm3,       QWORD PTR [rdi] ; xmm3 = d0,d1,d2..d7
    punpcklbw   xmm3,       xmm0            ; xmm3 = words of above

    psubw       xmm5,       xmm3            ; xmm5 -= xmm3
    paddw       xmm6,       xmm5            ; xmm6 += column differences
    pmaddwd     xmm5,       xmm5            ; xmm5 *= xmm5
    paddd       xmm7,       xmm5            ; xmm7 += squared column differences

%if ABI_IS_32BIT
    add         esi,        dword ptr arg(1) ;ref_pixels_per_line ; next source
    add         edi,        dword ptr arg(3) ;src_pixels_per_line ; next destination
%else
    add         rsi,        r8
    add         rdi,        r9
%endif
    sub         rcx,        1
    jnz         .half_horiz_variance8x_h_1

.half_horiz_variance8x_h_store:
    HALF_VARIANCE8X_H_STORE


    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret