;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Sum-of-absolute-differences (SAD) kernels built on psadbw, plus a
; 32-byte-wide block copy.  Three ABIs are supported through the macros from
; x86_abi_support.asm: 32-bit (stack arguments), Win64 and 64-bit System V.
;
; Naming used below:
;   *x3  : one source block against 3 reference blocks at ref_ptr+0/+1/+2.
;   *x4d : one source block against 4 independent reference pointers.
;
; NOTE(review): the 8-wide and 4-wide kernels use MMX registers and never
; execute emms; presumably the caller clears the x87/MMX state - confirm.

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
; Binds the symbolic operand names (src_ptr, src_stride, ref_ptr, ref_stride,
; end_ptr, ret_var, result_ptr / max_sad / height) to registers or stack slots
; for the active ABI and emits the matching prologue.
; result_ptr, max_sad and height all alias the 5th C argument.
;-----------------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_sad       arg(4)
  %define     height        dword ptr arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     max_sad     [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]

    ; The strides are C 'int' arguments: only the low 32 bits of the
    ; argument registers are defined, so widen them before they are used
    ; as 64-bit index registers (mirrors the movsxd in the 32-bit path).
    movsxd      rdx,        edx                 ; src_stride
    movsxd      r9,         r9d                 ; ref_stride
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     max_sad     r8
    ; 'height' is an int: operate on the 32-bit view so that undefined
    ; upper bits of r8 cannot influence the row-count comparisons.
    %define     height      r8d

    ; See the Win64 branch: widen the 'int' strides to 64 bits.
    movsxd      rsi,        esi                 ; src_stride
    movsxd      rcx,        ecx                 ; ref_stride
  %endif
%endif

%endmacro

;-----------------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
; Clears the symbolic names set by STACK_FRAME_CREATE_X3, restores the
; registers saved by its prologue and returns to the caller.
;-----------------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  %define     end_ptr
  %define     ret_var
  %define     result_ptr
  %define     max_sad
  %define     height

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro

;-----------------------------------------------------------------------------
; STACK_FRAME_CREATE_X4
; Prologue for the x4d kernels.  Binds src_ptr, src_stride, r0_ptr..r3_ptr,
; ref_stride and result_ptr, and loads the four reference pointers from the
; pointer array passed as the 3rd C argument.
;
; 32-bit: rbp is reused as ref_stride, so the frame pointer is pushed a
; second time; each function pops it again before reading result_ptr.
; Win64: rsi is used as r0_ptr and is saved here / restored in DESTROY_X4.
;-----------------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X4 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     r0_ptr        rcx
  %define     r1_ptr        rdx
  %define     r2_ptr        rbx
  %define     r3_ptr        rdi
  %define     ref_stride    rbp
  %define     result_ptr    arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    push        rbp                             ; keep frame pointer; rbp becomes ref_stride
    mov         rdi,        arg(2)              ; ref_ptr_base

    ; r2 is loaded into rax for now; it is swapped into rbx below.
    LOAD_X4_ADDRESSES rdi, rcx, rdx, rax, rdi

    mov         rsi,        arg(0)              ; src_ptr

    movsxd      rbx,        dword ptr arg(1)    ; src_stride
    movsxd      rbp,        dword ptr arg(3)    ; ref_stride

    xchg        rbx,        rax                 ; rax = src_stride, rbx = r2_ptr
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     r0_ptr      rsi
    %define     r1_ptr      r10
    %define     r2_ptr      r11
    %define     r3_ptr      r8
    %define     ref_stride  r9
    %define     result_ptr  [rsp+xmm_stack_space+16+4*8]
    push        rsi

    ; Widen the 'int' strides; only their low 32 bits are defined on entry.
    movsxd      rdx,        edx                 ; src_stride
    movsxd      r9,         r9d                 ; ref_stride

    LOAD_X4_ADDRESSES r8, r0_ptr, r1_ptr, r2_ptr, r3_ptr
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     r0_ptr      r9
    %define     r1_ptr      r10
    %define     r2_ptr      r11
    %define     r3_ptr      rdx
    %define     ref_stride  rcx
    %define     result_ptr  r8

    ; Widen the 'int' strides; only their low 32 bits are defined on entry.
    movsxd      rsi,        esi                 ; src_stride
    movsxd      rcx,        ecx                 ; ref_stride

    LOAD_X4_ADDRESSES rdx, r0_ptr, r1_ptr, r2_ptr, r3_ptr

  %endif
%endif
%endmacro

;-----------------------------------------------------------------------------
; STACK_FRAME_DESTROY_X4
; Clears the names set by STACK_FRAME_CREATE_X4, restores saved registers and
; returns.  On 32-bit the extra rbp pushed by CREATE_X4 must already have
; been popped by the function body.
;-----------------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X4 0
  %define     src_ptr
  %define     src_stride
  %define     r0_ptr
  %define     r1_ptr
  %define     r2_ptr
  %define     r3_ptr
  %define     ref_stride
  %define     result_ptr

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    pop         rsi
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro

;-----------------------------------------------------------------------------
; PROCESS_16X2X3 stage, src, ref, src_stride, ref_stride
; SAD of two 16-byte source rows against ref+0, ref+1 and ref+2.
;   stage 0 : first pair  - initialises the accumulators xmm5/xmm6/xmm7
;   stage 1 : middle pair - accumulates
;   stage 2 : last pair   - accumulates, pointers are not advanced
; Stages 0 and 1 advance src and ref by two rows.
; Source rows are read with movdqa and must therefore be 16-byte aligned.
; Clobbers xmm0-xmm3.
;-----------------------------------------------------------------------------
%macro PROCESS_16X2X3 5
%if %1==0
    movdqa      xmm0,       XMMWORD PTR [%2]
    lddqu       xmm5,       XMMWORD PTR [%3]
    lddqu       xmm6,       XMMWORD PTR [%3+1]
    lddqu       xmm7,       XMMWORD PTR [%3+2]

    psadbw      xmm5,       xmm0
    psadbw      xmm6,       xmm0
    psadbw      xmm7,       xmm0
%else
    movdqa      xmm0,       XMMWORD PTR [%2]
    lddqu       xmm1,       XMMWORD PTR [%3]
    lddqu       xmm2,       XMMWORD PTR [%3+1]
    lddqu       xmm3,       XMMWORD PTR [%3+2]

    psadbw      xmm1,       xmm0
    psadbw      xmm2,       xmm0
    psadbw      xmm3,       xmm0

    paddw       xmm5,       xmm1
    paddw       xmm6,       xmm2
    paddw       xmm7,       xmm3
%endif
    ; second row of the pair
    movdqa      xmm0,       XMMWORD PTR [%2+%4]
    lddqu       xmm1,       XMMWORD PTR [%3+%5]
    lddqu       xmm2,       XMMWORD PTR [%3+%5+1]
    lddqu       xmm3,       XMMWORD PTR [%3+%5+2]

%if %1==0 || %1==1
    lea         %2,         [%2+%4*2]
    lea         %3,         [%3+%5*2]
%endif

    psadbw      xmm1,       xmm0
    psadbw      xmm2,       xmm0
    psadbw      xmm3,       xmm0

    paddw       xmm5,       xmm1
    paddw       xmm6,       xmm2
    paddw       xmm7,       xmm3
%endmacro

;-----------------------------------------------------------------------------
; PROCESS_8X2X3 stage, src, ref, src_stride, ref_stride
; 8-byte-wide MMX counterpart of PROCESS_16X2X3.
; Accumulators: mm5/mm6/mm7.  Clobbers mm0-mm3.
;-----------------------------------------------------------------------------
%macro PROCESS_8X2X3 5
%if %1==0
    movq        mm0,        QWORD PTR [%2]
    movq        mm5,        QWORD PTR [%3]
    movq        mm6,        QWORD PTR [%3+1]
    movq        mm7,        QWORD PTR [%3+2]

    psadbw      mm5,        mm0
    psadbw      mm6,        mm0
    psadbw      mm7,        mm0
%else
    movq        mm0,        QWORD PTR [%2]
    movq        mm1,        QWORD PTR [%3]
    movq        mm2,        QWORD PTR [%3+1]
    movq        mm3,        QWORD PTR [%3+2]

    psadbw      mm1,        mm0
    psadbw      mm2,        mm0
    psadbw      mm3,        mm0

    paddw       mm5,        mm1
    paddw       mm6,        mm2
    paddw       mm7,        mm3
%endif
    ; second row of the pair
    movq        mm0,        QWORD PTR [%2+%4]
    movq        mm1,        QWORD PTR [%3+%5]
    movq        mm2,        QWORD PTR [%3+%5+1]
    movq        mm3,        QWORD PTR [%3+%5+2]

%if %1==0 || %1==1
    lea         %2,         [%2+%4*2]
    lea         %3,         [%3+%5*2]
%endif

    psadbw      mm1,        mm0
    psadbw      mm2,        mm0
    psadbw      mm3,        mm0

    paddw       mm5,        mm1
    paddw       mm6,        mm2
    paddw       mm7,        mm3
%endmacro

;-----------------------------------------------------------------------------
; LOAD_X4_ADDRESSES base, d0, d1, d2, d3
; Loads four consecutive pointer-sized entries from [base] into d0..d3.
; d3 is written last, so it may be the same register as base.
;-----------------------------------------------------------------------------
%macro LOAD_X4_ADDRESSES 5
    mov         %2,         [%1+REG_SZ_BYTES*0]
    mov         %3,         [%1+REG_SZ_BYTES*1]

    mov         %4,         [%1+REG_SZ_BYTES*2]
    mov         %5,         [%1+REG_SZ_BYTES*3]
%endmacro

;-----------------------------------------------------------------------------
; PROCESS_16X2X4 stage, src, r0, r1, r2, r3, src_stride, ref_stride
; SAD of two 16-byte source rows against four reference pointers.
; Accumulators: xmm4 (r0), xmm5 (r1), xmm6 (r2), xmm7 (r3).
; stage has the same meaning as in PROCESS_16X2X3.  Clobbers xmm0-xmm3.
;-----------------------------------------------------------------------------
%macro PROCESS_16X2X4 8
%if %1==0
    movdqa      xmm0,       XMMWORD PTR [%2]
    lddqu       xmm4,       XMMWORD PTR [%3]
    lddqu       xmm5,       XMMWORD PTR [%4]
    lddqu       xmm6,       XMMWORD PTR [%5]
    lddqu       xmm7,       XMMWORD PTR [%6]

    psadbw      xmm4,       xmm0
    psadbw      xmm5,       xmm0
    psadbw      xmm6,       xmm0
    psadbw      xmm7,       xmm0
%else
    movdqa      xmm0,       XMMWORD PTR [%2]
    lddqu       xmm1,       XMMWORD PTR [%3]
    lddqu       xmm2,       XMMWORD PTR [%4]
    lddqu       xmm3,       XMMWORD PTR [%5]

    psadbw      xmm1,       xmm0
    psadbw      xmm2,       xmm0
    psadbw      xmm3,       xmm0

    paddw       xmm4,       xmm1
    lddqu       xmm1,       XMMWORD PTR [%6]    ; xmm1 reused for the 4th reference
    paddw       xmm5,       xmm2
    paddw       xmm6,       xmm3

    psadbw      xmm1,       xmm0
    paddw       xmm7,       xmm1
%endif
    ; second row of the pair
    movdqa      xmm0,       XMMWORD PTR [%2+%7]
    lddqu       xmm1,       XMMWORD PTR [%3+%8]
    lddqu       xmm2,       XMMWORD PTR [%4+%8]
    lddqu       xmm3,       XMMWORD PTR [%5+%8]

    psadbw      xmm1,       xmm0
    psadbw      xmm2,       xmm0
    psadbw      xmm3,       xmm0

    paddw       xmm4,       xmm1
    lddqu       xmm1,       XMMWORD PTR [%6+%8] ; xmm1 reused for the 4th reference
    paddw       xmm5,       xmm2
    paddw       xmm6,       xmm3

%if %1==0 || %1==1
    lea         %2,         [%2+%7*2]
    lea         %3,         [%3+%8*2]

    lea         %4,         [%4+%8*2]
    lea         %5,         [%5+%8*2]

    lea         %6,         [%6+%8*2]
%endif
    psadbw      xmm1,       xmm0
    paddw       xmm7,       xmm1

%endmacro

;-----------------------------------------------------------------------------
; PROCESS_8X2X4 stage, src, r0, r1, r2, r3, src_stride, ref_stride
; 8-byte-wide MMX counterpart of PROCESS_16X2X4.
; Accumulators: mm4 (r0), mm5 (r1), mm6 (r2), mm7 (r3).  Clobbers mm0-mm3.
;-----------------------------------------------------------------------------
%macro PROCESS_8X2X4 8
%if %1==0
    movq        mm0,        QWORD PTR [%2]
    movq        mm4,        QWORD PTR [%3]
    movq        mm5,        QWORD PTR [%4]
    movq        mm6,        QWORD PTR [%5]
    movq        mm7,        QWORD PTR [%6]

    psadbw      mm4,        mm0
    psadbw      mm5,        mm0
    psadbw      mm6,        mm0
    psadbw      mm7,        mm0
%else
    movq        mm0,        QWORD PTR [%2]
    movq        mm1,        QWORD PTR [%3]
    movq        mm2,        QWORD PTR [%4]
    movq        mm3,        QWORD PTR [%5]

    psadbw      mm1,        mm0
    psadbw      mm2,        mm0
    psadbw      mm3,        mm0

    paddw       mm4,        mm1
    movq        mm1,        QWORD PTR [%6]      ; mm1 reused for the 4th reference
    paddw       mm5,        mm2
    paddw       mm6,        mm3

    psadbw      mm1,        mm0
    paddw       mm7,        mm1
%endif
    ; second row of the pair
    movq        mm0,        QWORD PTR [%2+%7]
    movq        mm1,        QWORD PTR [%3+%8]
    movq        mm2,        QWORD PTR [%4+%8]
    movq        mm3,        QWORD PTR [%5+%8]

    psadbw      mm1,        mm0
    psadbw      mm2,        mm0
    psadbw      mm3,        mm0

    paddw       mm4,        mm1
    movq        mm1,        QWORD PTR [%6+%8]   ; mm1 reused for the 4th reference
    paddw       mm5,        mm2
    paddw       mm6,        mm3

%if %1==0 || %1==1
    lea         %2,         [%2+%7*2]
    lea         %3,         [%3+%8*2]

    lea         %4,         [%4+%8*2]
    lea         %5,         [%5+%8*2]

    lea         %6,         [%6+%8*2]
%endif
    psadbw      mm1,        mm0
    paddw       mm7,        mm1

%endmacro

;void vp8_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Writes three 32-bit SADs (ref_ptr+0, +1, +2) to results[0..2].
global sym(vp8_sad16x16x3_sse3) PRIVATE
sym(vp8_sad16x16x3_sse3):

    STACK_FRAME_CREATE_X3

    PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

    mov         rcx,        result_ptr

    ; psadbw leaves one partial sum per 64-bit half: add high half to low.
    movq        xmm0,       xmm5
    psrldq      xmm5,       8

    paddw       xmm0,       xmm5
    movd        [rcx],      xmm0
;-
    movq        xmm0,       xmm6
    psrldq      xmm6,       8

    paddw       xmm0,       xmm6
    movd        [rcx+4],    xmm0
;-
    movq        xmm0,       xmm7
    psrldq      xmm7,       8

    paddw       xmm0,       xmm7
    movd        [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3

;void vp8_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Writes three 32-bit SADs (ref_ptr+0, +1, +2) to results[0..2].
global sym(vp8_sad16x8x3_sse3) PRIVATE
sym(vp8_sad16x8x3_sse3):

    STACK_FRAME_CREATE_X3

    PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

    mov         rcx,        result_ptr

    ; psadbw leaves one partial sum per 64-bit half: add high half to low.
    movq        xmm0,       xmm5
    psrldq      xmm5,       8

    paddw       xmm0,       xmm5
    movd        [rcx],      xmm0
;-
    movq        xmm0,       xmm6
    psrldq      xmm6,       8

    paddw       xmm0,       xmm6
    movd        [rcx+4],    xmm0
;-
    movq        xmm0,       xmm7
    psrldq      xmm7,       8

    paddw       xmm0,       xmm7
    movd        [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3

;void vp8_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Writes three 32-bit SADs (ref_ptr+0, +1, +2) to results[0..2].
global sym(vp8_sad8x16x3_sse3) PRIVATE
sym(vp8_sad8x16x3_sse3):

    STACK_FRAME_CREATE_X3

    PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

    mov         rcx,        result_ptr

    punpckldq   mm5,        mm6                 ; mm5 = { sad(+0), sad(+1) }

    movq        [rcx],      mm5
    movd        [rcx+8],    mm7

    STACK_FRAME_DESTROY_X3

;void vp8_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Writes three 32-bit SADs (ref_ptr+0, +1, +2) to results[0..2].
global sym(vp8_sad8x8x3_sse3) PRIVATE
sym(vp8_sad8x8x3_sse3):

    STACK_FRAME_CREATE_X3

    PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
    PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

    mov         rcx,        result_ptr

    punpckldq   mm5,        mm6                 ; mm5 = { sad(+0), sad(+1) }

    movq        [rcx],      mm5
    movd        [rcx+8],    mm7

    STACK_FRAME_DESTROY_X3

;void vp8_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; Writes three 32-bit SADs (ref_ptr+0, +1, +2) to results[0..2].
; Two 4-byte rows are interleaved into one 8-byte register so that a single
; psadbw covers both rows.
global sym(vp8_sad4x4x3_sse3) PRIVATE
sym(vp8_sad4x4x3_sse3):

    STACK_FRAME_CREATE_X3

    ; rows 0 and 1
    movd        mm0,        DWORD PTR [src_ptr]
    movd        mm1,        DWORD PTR [ref_ptr]

    movd        mm2,        DWORD PTR [src_ptr+src_stride]
    movd        mm3,        DWORD PTR [ref_ptr+ref_stride]

    punpcklbw   mm0,        mm2
    punpcklbw   mm1,        mm3

    movd        mm4,        DWORD PTR [ref_ptr+1]
    movd        mm5,        DWORD PTR [ref_ptr+2]

    movd        mm2,        DWORD PTR [ref_ptr+ref_stride+1]
    movd        mm3,        DWORD PTR [ref_ptr+ref_stride+2]

    psadbw      mm1,        mm0

    punpcklbw   mm4,        mm2
    punpcklbw   mm5,        mm3

    psadbw      mm4,        mm0
    psadbw      mm5,        mm0

    lea         src_ptr,    [src_ptr+src_stride*2]
    lea         ref_ptr,    [ref_ptr+ref_stride*2]

    ; rows 2 and 3
    movd        mm0,        DWORD PTR [src_ptr]
    movd        mm2,        DWORD PTR [ref_ptr]

    movd        mm3,        DWORD PTR [src_ptr+src_stride]
    movd        mm6,        DWORD PTR [ref_ptr+ref_stride]

    punpcklbw   mm0,        mm3
    punpcklbw   mm2,        mm6

    movd        mm3,        DWORD PTR [ref_ptr+1]
    movd        mm7,        DWORD PTR [ref_ptr+2]

    psadbw      mm2,        mm0

    paddw       mm1,        mm2

    movd        mm2,        DWORD PTR [ref_ptr+ref_stride+1]
    movd        mm6,        DWORD PTR [ref_ptr+ref_stride+2]

    punpcklbw   mm3,        mm2
    punpcklbw   mm7,        mm6

    psadbw      mm3,        mm0
    psadbw      mm7,        mm0

    paddw       mm3,        mm4
    paddw       mm7,        mm5

    mov         rcx,        result_ptr

    punpckldq   mm1,        mm3                 ; mm1 = { sad(+0), sad(+1) }

    movq        [rcx],      mm1
    movd        [rcx+8],    mm7

    STACK_FRAME_DESTROY_X3

;unsigned int vp8_sad16x16_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_sad)
;
; Returns the 16x16 SAD in rax.  max_sad is not read by this implementation.
; Source rows are read with movdqa and must be 16-byte aligned.
;%define lddqu movdqu
global sym(vp8_sad16x16_sse3) PRIVATE
sym(vp8_sad16x16_sse3):

    STACK_FRAME_CREATE_X3

    mov         end_ptr,    4                   ; 4 iterations x 4 rows
    pxor        xmm7,       xmm7                ; running SAD

.vp8_sad16x16_sse3_loop:
    movdqa      xmm0,       XMMWORD PTR [src_ptr]
    movdqu      xmm1,       XMMWORD PTR [ref_ptr]
    movdqa      xmm2,       XMMWORD PTR [src_ptr+src_stride]
    movdqu      xmm3,       XMMWORD PTR [ref_ptr+ref_stride]

    lea         src_ptr,    [src_ptr+src_stride*2]
    lea         ref_ptr,    [ref_ptr+ref_stride*2]

    movdqa      xmm4,       XMMWORD PTR [src_ptr]
    movdqu      xmm5,       XMMWORD PTR [ref_ptr]
    movdqa      xmm6,       XMMWORD PTR [src_ptr+src_stride]

    psadbw      xmm0,       xmm1

    movdqu      xmm1,       XMMWORD PTR [ref_ptr+ref_stride]

    psadbw      xmm2,       xmm3
    psadbw      xmm4,       xmm5
    psadbw      xmm6,       xmm1

    lea         src_ptr,    [src_ptr+src_stride*2]
    lea         ref_ptr,    [ref_ptr+ref_stride*2]

    paddw       xmm7,       xmm0
    paddw       xmm7,       xmm2
    paddw       xmm7,       xmm4
    paddw       xmm7,       xmm6

    sub         end_ptr,    1
    jne         .vp8_sad16x16_sse3_loop

    ; fold the two 64-bit partial sums and return the total
    movq        xmm0,       xmm7
    psrldq      xmm7,       8
    paddw       xmm0,       xmm7
    movq        rax,        xmm0

    STACK_FRAME_DESTROY_X3

;void vp8_copy32xn_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int height);
;
; Copies 'height' rows of 32 bytes.  The destination is addressed through the
; ref_ptr/ref_stride names and is written with movdqa, so every destination
; row must be 16-byte aligned; the source may be unaligned.
; Rows are copied four at a time, then one at a time for the remainder.
; A height of zero or less copies nothing.
global sym(vp8_copy32xn_sse3) PRIVATE
sym(vp8_copy32xn_sse3):

    STACK_FRAME_CREATE_X3

    ; Do not enter the 4-row body unless at least 4 rows are requested;
    ; otherwise it would read and write rows beyond the requested block.
    cmp         height,     4
    jl          .block_copy_sse3_tail

.block_copy_sse3_loopx4:
    lea         end_ptr,    [src_ptr+src_stride*2]

    movdqu      xmm0,       XMMWORD PTR [src_ptr]
    movdqu      xmm1,       XMMWORD PTR [src_ptr + 16]
    movdqu      xmm2,       XMMWORD PTR [src_ptr + src_stride]
    movdqu      xmm3,       XMMWORD PTR [src_ptr + src_stride + 16]
    movdqu      xmm4,       XMMWORD PTR [end_ptr]
    movdqu      xmm5,       XMMWORD PTR [end_ptr + 16]
    movdqu      xmm6,       XMMWORD PTR [end_ptr + src_stride]
    movdqu      xmm7,       XMMWORD PTR [end_ptr + src_stride + 16]

    lea         src_ptr,    [src_ptr+src_stride*4]

    lea         end_ptr,    [ref_ptr+ref_stride*2]

    movdqa      XMMWORD PTR [ref_ptr], xmm0
    movdqa      XMMWORD PTR [ref_ptr + 16], xmm1
    movdqa      XMMWORD PTR [ref_ptr + ref_stride], xmm2
    movdqa      XMMWORD PTR [ref_ptr + ref_stride + 16], xmm3
    movdqa      XMMWORD PTR [end_ptr], xmm4
    movdqa      XMMWORD PTR [end_ptr + 16], xmm5
    movdqa      XMMWORD PTR [end_ptr + ref_stride], xmm6
    movdqa      XMMWORD PTR [end_ptr + ref_stride + 16], xmm7

    lea         ref_ptr,    [ref_ptr+ref_stride*4]

    sub         height,     4
    cmp         height,     4
    jge         .block_copy_sse3_loopx4

.block_copy_sse3_tail:
    ; Check whether any rows (1..3) remain to be copied.  Signed test so a
    ; non-positive count can never enter the count-down loop below.
    cmp         height,     0
    jle         .copy_is_done

.block_copy_sse3_loop:
    movdqu      xmm0,       XMMWORD PTR [src_ptr]
    movdqu      xmm1,       XMMWORD PTR [src_ptr + 16]
    lea         src_ptr,    [src_ptr+src_stride]

    movdqa      XMMWORD PTR [ref_ptr], xmm0
    movdqa      XMMWORD PTR [ref_ptr + 16], xmm1
    lea         ref_ptr,    [ref_ptr+ref_stride]

    sub         height,     1
    jne         .block_copy_sse3_loop

.copy_is_done:
    STACK_FRAME_DESTROY_X3

;void vp8_sad16x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; ref_ptr_base points to an array of four reference pointers.
; Writes four 32-bit SADs to results[0..3].
global sym(vp8_sad16x16x4d_sse3) PRIVATE
sym(vp8_sad16x16x4d_sse3):

    STACK_FRAME_CREATE_X4

    PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
    pop         rbp                             ; restore frame pointer before using arg()
%endif
    mov         rcx,        result_ptr

    ; psadbw leaves one partial sum per 64-bit half: add high half to low.
    movq        xmm0,       xmm4
    psrldq      xmm4,       8

    paddw       xmm0,       xmm4
    movd        [rcx],      xmm0
;-
    movq        xmm0,       xmm5
    psrldq      xmm5,       8

    paddw       xmm0,       xmm5
    movd        [rcx+4],    xmm0
;-
    movq        xmm0,       xmm6
    psrldq      xmm6,       8

    paddw       xmm0,       xmm6
    movd        [rcx+8],    xmm0
;-
    movq        xmm0,       xmm7
    psrldq      xmm7,       8

    paddw       xmm0,       xmm7
    movd        [rcx+12],   xmm0

    STACK_FRAME_DESTROY_X4

;void vp8_sad16x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr_base,
;    int  ref_stride,
;    int  *results)
;
; ref_ptr_base points to an array of four reference pointers.
; Writes four 32-bit SADs to results[0..3].
global sym(vp8_sad16x8x4d_sse3) PRIVATE
sym(vp8_sad16x8x4d_sse3):

    STACK_FRAME_CREATE_X4

    PROCESS_16X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_16X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
    pop         rbp                             ; restore frame pointer before using arg()
%endif
    mov         rcx,        result_ptr

    ; psadbw leaves one partial sum per 64-bit half: add high half to low.
    movq        xmm0,       xmm4
    psrldq      xmm4,       8

    paddw       xmm0,       xmm4
    movd        [rcx],      xmm0
;-
    movq        xmm0,       xmm5
    psrldq      xmm5,       8

    paddw       xmm0,       xmm5
    movd        [rcx+4],    xmm0
;-
    movq        xmm0,       xmm6
    psrldq      xmm6,       8

    paddw       xmm0,       xmm6
    movd        [rcx+8],    xmm0
;-
    movq        xmm0,       xmm7
    psrldq      xmm7,       8

    paddw       xmm0,       xmm7
    movd        [rcx+12],   xmm0

    STACK_FRAME_DESTROY_X4

;void vp8_sad8x16x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; The 3rd argument is read as an array of four reference pointers.
; Writes four 32-bit SADs to results[0..3].
global sym(vp8_sad8x16x4d_sse3) PRIVATE
sym(vp8_sad8x16x4d_sse3):

    STACK_FRAME_CREATE_X4

    PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
    pop         rbp                             ; restore frame pointer before using arg()
%endif
    mov         rcx,        result_ptr

    punpckldq   mm4,        mm5                 ; mm4 = { sad(r0), sad(r1) }
    punpckldq   mm6,        mm7                 ; mm6 = { sad(r2), sad(r3) }

    movq        [rcx],      mm4
    movq        [rcx+8],    mm6

    STACK_FRAME_DESTROY_X4

;void vp8_sad8x8x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; The 3rd argument is read as an array of four reference pointers.
; Writes four 32-bit SADs to results[0..3].
global sym(vp8_sad8x8x4d_sse3) PRIVATE
sym(vp8_sad8x8x4d_sse3):

    STACK_FRAME_CREATE_X4

    PROCESS_8X2X4 0, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 1, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride
    PROCESS_8X2X4 2, src_ptr, r0_ptr, r1_ptr, r2_ptr, r3_ptr, src_stride, ref_stride

%if ABI_IS_32BIT
    pop         rbp                             ; restore frame pointer before using arg()
%endif
    mov         rcx,        result_ptr

    punpckldq   mm4,        mm5                 ; mm4 = { sad(r0), sad(r1) }
    punpckldq   mm6,        mm7                 ; mm6 = { sad(r2), sad(r3) }

    movq        [rcx],      mm4
    movq        [rcx+8],    mm6

    STACK_FRAME_DESTROY_X4

;void vp8_sad4x4x4d_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; The 3rd argument is read as an array of four reference pointers.
; Writes four 32-bit SADs to results[0..3].
; Two 4-byte rows are interleaved into one 8-byte register so that a single
; psadbw covers both rows.
global sym(vp8_sad4x4x4d_sse3) PRIVATE
sym(vp8_sad4x4x4d_sse3):

    STACK_FRAME_CREATE_X4

    ; rows 0 and 1
    movd        mm0,        DWORD PTR [src_ptr]
    movd        mm1,        DWORD PTR [r0_ptr]

    movd        mm2,        DWORD PTR [src_ptr+src_stride]
    movd        mm3,        DWORD PTR [r0_ptr+ref_stride]

    punpcklbw   mm0,        mm2
    punpcklbw   mm1,        mm3

    movd        mm4,        DWORD PTR [r1_ptr]
    movd        mm5,        DWORD PTR [r2_ptr]

    movd        mm6,        DWORD PTR [r3_ptr]
    movd        mm2,        DWORD PTR [r1_ptr+ref_stride]

    movd        mm3,        DWORD PTR [r2_ptr+ref_stride]
    movd        mm7,        DWORD PTR [r3_ptr+ref_stride]

    psadbw      mm1,        mm0

    punpcklbw   mm4,        mm2
    punpcklbw   mm5,        mm3

    punpcklbw   mm6,        mm7
    psadbw      mm4,        mm0

    psadbw      mm5,        mm0
    psadbw      mm6,        mm0



    lea         src_ptr,    [src_ptr+src_stride*2]
    lea         r0_ptr,     [r0_ptr+ref_stride*2]

    lea         r1_ptr,     [r1_ptr+ref_stride*2]
    lea         r2_ptr,     [r2_ptr+ref_stride*2]

    lea         r3_ptr,     [r3_ptr+ref_stride*2]

    ; rows 2 and 3
    movd        mm0,        DWORD PTR [src_ptr]
    movd        mm2,        DWORD PTR [r0_ptr]

    movd        mm3,        DWORD PTR [src_ptr+src_stride]
    movd        mm7,        DWORD PTR [r0_ptr+ref_stride]

    punpcklbw   mm0,        mm3
    punpcklbw   mm2,        mm7

    movd        mm3,        DWORD PTR [r1_ptr]
    movd        mm7,        DWORD PTR [r2_ptr]

    psadbw      mm2,        mm0
%if ABI_IS_32BIT
    ; src_stride (rax) is no longer needed: move ref_stride out of rbp so the
    ; frame pointer can be restored for the arg() access below.
    mov         rax,        rbp

    pop         rbp
%define     ref_stride    rax
%endif
    ; rsi (src_ptr / src_stride / r0_ptr depending on ABI) is dead from here.
    mov         rsi,        result_ptr

    paddw       mm1,        mm2
    movd        [rsi],      mm1

    movd        mm2,        DWORD PTR [r1_ptr+ref_stride]
    movd        mm1,        DWORD PTR [r2_ptr+ref_stride]

    punpcklbw   mm3,        mm2
    punpcklbw   mm7,        mm1

    psadbw      mm3,        mm0
    psadbw      mm7,        mm0

    movd        mm2,        DWORD PTR [r3_ptr]
    movd        mm1,        DWORD PTR [r3_ptr+ref_stride]

    paddw       mm3,        mm4
    paddw       mm7,        mm5

    movd        [rsi+4],    mm3
    punpcklbw   mm2,        mm1

    movd        [rsi+8],    mm7
    psadbw      mm2,        mm0

    paddw       mm2,        mm6
    movd        [rsi+12],   mm2


    STACK_FRAME_DESTROY_X4