;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;unsigned int vp8_get_mb_ss_mmx( short *src_ptr )
;
; Returns the sum of squares of 256 consecutive signed 16-bit values
; starting at src_ptr (16 iterations x 32 bytes per iteration).
;
; Register roles:
;   rax = read pointer, advanced by 32 bytes per iteration
;   rcx = iterations remaining (starts at 16)
;   mm4 = two 32-bit partial sums of squares
;
; NOTE(review): only arg(0) is read and no GLOBAL() data is referenced, so
; the argument count of 7 and the GET_GOT/RESTORE_GOT pair look larger than
; this function needs -- confirm against x86_abi_support.asm before trimming.
global sym(vp8_get_mb_ss_mmx) PRIVATE
sym(vp8_get_mb_ss_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 8                      ; 8-byte scratch slot for the mm4 spill
    ; end prolog

        mov         rax, arg(0)             ;src_ptr
        mov         rcx, 16                 ; iteration count
        pxor        mm4, mm4                ; clear the accumulator

.NEXTROW:
        movq        mm0, [rax]              ; 16 shorts per iteration
        movq        mm1, [rax+8]
        movq        mm2, [rax+16]
        movq        mm3, [rax+24]
        pmaddwd     mm0, mm0                ; square each word, add adjacent pairs
        pmaddwd     mm1, mm1
        pmaddwd     mm2, mm2
        pmaddwd     mm3, mm3

        paddd       mm4, mm0
        paddd       mm4, mm1
        paddd       mm4, mm2
        paddd       mm4, mm3

        add         rax, 32
        dec         rcx
        ; dec leaves CF untouched, so branch on ZF only (jnz) rather than on
        ; CF+ZF (ja), which would depend on the carry left by the add above.
        jnz         .NEXTROW
        movq        QWORD PTR [rsp], mm4

        ;return sum[0]+sum[1];
        movsxd      rax, dword ptr [rsp]
        movsxd      rcx, dword ptr [rsp+4]
        add         rax, rcx


    ; begin epilog
    add         rsp, 8
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_get8x8var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
global sym(vp8_get8x8var_mmx) PRIVATE
sym(vp8_get8x8var_mmx):
    ; For an 8x8 block: stores the sum of squared differences (src - ref) to
    ; *SSE, stores the sum of differences to *Sum, and returns 0.
    ;
    ; Register roles:
    ;   rax = src row pointer       rcx = source_stride (sign-extended)
    ;   rbx = ref row pointer       rdx = recon_stride  (sign-extended)
    ;   mm5 = 4 x int16 running sums of differences
    ;   mm6 = zero, used to widen bytes to words
    ;   mm7 = 2 x int32 running sums of squared differences
    ; No emms is issued in this function.
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                     ; scratch: [rsp] = mm7 spill, [rsp+8] = mm5 spill
    ; end prolog

        pxor        mm5, mm5                ; clear difference accumulator
        pxor        mm6, mm6                ; zero for byte -> word unpacking
        pxor        mm7, mm7                ; clear squared-difference accumulator

        mov         rax, arg(0)             ;[src_ptr]  ; Load base addresses
        mov         rbx, arg(2)             ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1)   ;[source_stride]
        movsxd      rdx, dword ptr arg(3)   ;[recon_stride]

        ; One expansion per row. Every row is handled identically, so the
        ; body is generated with %rep instead of being written out 8 times.
%rep 8
        movq        mm0, [rax]              ; 8 source pixels
        movq        mm1, [rbx]              ; 8 reference pixels
        movq        mm2, mm0                ; copies for the high halves
        movq        mm3, mm1

        punpcklbw   mm0, mm6                ; unpack to higher precision
        punpcklbw   mm1, mm6
        punpckhbw   mm2, mm6
        punpckhbw   mm3, mm6
        psubsw      mm0, mm1                ; A-B (low order) to mm0
        psubsw      mm2, mm3                ; A-B (high order) to mm2

        paddw       mm5, mm0                ; accumulate differences in mm5
        paddw       mm5, mm2

        pmaddwd     mm0, mm0                ; square and pair-sum
        pmaddwd     mm2, mm2
        add         rbx, rdx                ; next reference row
        add         rax, rcx                ; next source row
        paddd       mm7, mm0                ; accumulate squares in mm7
        paddd       mm7, mm2
%endrep

        ; Now accumulate the final results.
        movq        QWORD PTR [rsp+8], mm5  ; spill the four 16-bit sums
        movq        QWORD PTR [rsp], mm7    ; spill the two 32-bit sums of squares
        movsx       rdx, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        movsx       rbx, WORD PTR [rsp+12]
        movsx       rax, WORD PTR [rsp+14]
        add         rdx, rcx
        add         rbx, rax
        add         rdx, rbx                ;XSum
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx                ;XXSum
        mov         rsi, arg(4)             ;SSE
        mov         rdi, arg(5)             ;Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax, rax                ; return 0


    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret



;unsigned int
;vp8_get4x4var_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride,
;    unsigned int *SSE,
;    int *Sum
;)
;
; For a 4x4 block: stores the sum of squared differences (src - ref) to *SSE,
; stores the sum of differences to *Sum, and returns 0.
;
; Register roles match vp8_get8x8var_mmx: rax/rbx = src/ref row pointers,
; rcx/rdx = strides, mm5 = 16-bit difference sums, mm6 = zero,
; mm7 = 32-bit sums of squares.
global sym(vp8_get4x4var_mmx) PRIVATE
sym(vp8_get4x4var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    push        rsi
    push        rdi
    push        rbx
    sub         rsp, 16                     ; scratch: [rsp] = mm7 spill, [rsp+8] = mm5 spill
    ; end prolog

        pxor        mm5, mm5                ; clear difference accumulator
        pxor        mm6, mm6                ; zero for byte -> word unpacking
        pxor        mm7, mm7                ; clear squared-difference accumulator

        mov         rax, arg(0)             ;[src_ptr]  ; Load base addresses
        mov         rbx, arg(2)             ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1)   ;[source_stride]
        movsxd      rdx, dword ptr arg(3)   ;[recon_stride]

        ; One expansion per row. Only the low four bytes of each row are
        ; consumed (punpcklbw), so load exactly four bytes with movd rather
        ; than eight with movq; this avoids reading past the 4-pixel row.
%rep 4
        movd        mm0, [rax]              ; 4 source pixels
        movd        mm1, [rbx]              ; 4 reference pixels
        punpcklbw   mm0, mm6                ; unpack to higher precision
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                ; A-B to mm0
        paddw       mm5, mm0                ; accumulate differences in mm5
        pmaddwd     mm0, mm0                ; square and pair-sum
        add         rbx, rdx                ; next reference row
        add         rax, rcx                ; next source row
        paddd       mm7, mm0                ; accumulate squares in mm7
%endrep

        ; Now accumulate the final results.
        movq        QWORD PTR [rsp+8], mm5  ; spill the four 16-bit sums
        movq        QWORD PTR [rsp], mm7    ; spill the two 32-bit sums of squares
        movsx       rdx, WORD PTR [rsp+8]
        movsx       rcx, WORD PTR [rsp+10]
        movsx       rbx, WORD PTR [rsp+12]
        movsx       rax, WORD PTR [rsp+14]
        add         rdx, rcx
        add         rbx, rax
        add         rdx, rbx                ;XSum
        movsxd      rax, DWORD PTR [rsp]
        movsxd      rcx, DWORD PTR [rsp+4]
        add         rax, rcx                ;XXSum
        mov         rsi, arg(4)             ;SSE
        mov         rdi, arg(5)             ;Sum
        mov         dword ptr [rsi], eax
        mov         dword ptr [rdi], edx
        xor         rax, rax                ; return 0


    ; begin epilog
    add         rsp, 16
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret



;unsigned int
;vp8_get4x4sse_cs_mmx
;(
;    unsigned char *src_ptr,
;    int  source_stride,
;    unsigned char *ref_ptr,
;    int  recon_stride
;)
;
; Returns the sum of squared differences (src - ref) over a 4x4 block.
;
; Register roles: rax/rbx = src/ref row pointers, rcx/rdx = strides,
; mm6 = zero, mm7 = 2 x int32 running sums of squares.
global sym(vp8_get4x4sse_cs_mmx) PRIVATE
sym(vp8_get4x4sse_cs_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

        pxor        mm6, mm6                ; zero for byte -> word unpacking
        pxor        mm7, mm7                ; clear squared-difference accumulator

        mov         rax, arg(0)             ;[src_ptr]  ; Load base addresses
        mov         rbx, arg(2)             ;[ref_ptr]
        movsxd      rcx, dword ptr arg(1)   ;[source_stride]
        movsxd      rdx, dword ptr arg(3)   ;[recon_stride]

        ; One expansion per row.
%rep 4
        movd        mm0, [rax]              ; 4 source pixels
        movd        mm1, [rbx]              ; 4 reference pixels
        punpcklbw   mm0, mm6                ; unpack to higher precision
        punpcklbw   mm1, mm6
        psubsw      mm0, mm1                ; A-B to mm0
        pmaddwd     mm0, mm0                ; square and pair-sum
        add         rbx, rdx                ; next reference row
        add         rax, rcx                ; next source row
        paddd       mm7, mm0                ; accumulate squares in mm7
%endrep

        movq        mm0, mm7                ; fold the two 32-bit lanes together
        psrlq       mm7, 32

        paddd       mm0, mm7
        movq        rax, mm0                ; result is in the low 32 bits


    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

; Right shift applied after each bilinear filter pass; mmx_bi_rd (64) is the
; matching rounding term added before the shift.
%define mmx_filter_shift            7

;void vp8_filter_block2d_bil4x4_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; Applies a two-tap horizontal then two-tap vertical filter to a 4-wide
; region of ref_ptr, subtracts the matching src_ptr row, and stores the sum
; of the differences to *sum and the sum of their squares to *sumsquared.
; Four output rows are produced, so five rows of ref_ptr are read.
;
; The first tap is read as four words at [Filter] and the second as four
; words at [Filter+8].
;
; Register roles:
;   rsi = ref row pointer           rdi = src row pointer
;   rax = HFilter                   rdx = VFilter
;   rcx = rows remaining
;   mm0 = zero                      mm5 = previous horizontally filtered row
;   mm6 = 4 x int16 difference sums mm7 = 2 x int32 sums of squares
global sym(vp8_filter_block2d_bil4x4_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil4x4_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

        pxor        mm6, mm6                ; clear difference accumulator
        pxor        mm7, mm7                ; clear squared-difference accumulator

        mov         rax, arg(4)             ;HFilter
        mov         rdx, arg(5)             ;VFilter

        mov         rsi, arg(0)             ;ref_ptr
        mov         rdi, arg(2)             ;src_ptr

        mov         rcx, 4                  ; output rows
        pxor        mm0, mm0                ; zero for byte -> word unpacking

        ; Horizontal pass over the first reference row; result kept in mm5.
        movd        mm1, [rsi]              ; pixels x .. x+3
        movd        mm3, [rsi+1]            ; pixels x+1 .. x+4

        punpcklbw   mm1, mm0
        pmullw      mm1, [rax]              ; * first horizontal tap

        punpcklbw   mm3, mm0
        pmullw      mm3, [rax+8]            ; * second horizontal tap

        paddw       mm1, mm3
        paddw       mm1, [GLOBAL(mmx_bi_rd)]    ; round

        psraw       mm1, mmx_filter_shift
        movq        mm5, mm1

%if ABI_IS_32BIT
        add         rsi, dword ptr arg(1)   ;ref_pixels_per_line
%else
        movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
        add         rsi, r8
%endif

.filter_block2d_bil4x4_var_mmx_loop:

        ; Horizontal pass over the current reference row.
        movd        mm1, [rsi]
        movd        mm3, [rsi+1]

        punpcklbw   mm1, mm0
        pmullw      mm1, [rax]

        punpcklbw   mm3, mm0
        pmullw      mm3, [rax+8]

        paddw       mm1, mm3
        paddw       mm1, [GLOBAL(mmx_bi_rd)]

        psraw       mm1, mmx_filter_shift
        movq        mm3, mm5                ; mm3 = previous filtered row

        ; Vertical pass: blend previous and current filtered rows.
        movq        mm5, mm1                ; current row becomes "previous" next time
        pmullw      mm3, [rdx]              ; * first vertical tap

        pmullw      mm1, [rdx+8]            ; * second vertical tap
        paddw       mm1, mm3

        paddw       mm1, [GLOBAL(mmx_bi_rd)]    ; round
        psraw       mm1, mmx_filter_shift

        ; Difference against the source row, then accumulate.
        movd        mm3, [rdi]
        punpcklbw   mm3, mm0

        psubw       mm1, mm3
        paddw       mm6, mm1                ; sum of differences

        pmaddwd     mm1, mm1
        paddd       mm7, mm1                ; sum of squared differences

%if ABI_IS_32BIT
        add         rsi, dword ptr arg(1)   ;ref_pixels_per_line
        add         rdi, dword ptr arg(3)   ;src_pixels_per_line
%else
        movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
        movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
        add         rsi, r8
        add         rdi, r9
%endif
        sub         rcx, 1
        jnz         .filter_block2d_bil4x4_var_mmx_loop

        ; Reduce mm6: place each 16-bit lane in the top half of a dword, add
        ; the lanes, then arithmetic-shift right by 16 to get a signed total.
        pxor        mm3, mm3
        pxor        mm2, mm2

        punpcklwd   mm2, mm6
        punpckhwd   mm3, mm6

        paddd       mm2, mm3
        movq        mm6, mm2

        psrlq       mm6, 32
        paddd       mm2, mm6

        psrad       mm2, 16

        ; Reduce mm7: add the two 32-bit lanes.
        movq        mm4, mm7

        psrlq       mm4, 32
        paddd       mm4, mm7

        mov         rdi, arg(6)             ;sum
        mov         rsi, arg(7)             ;sumsquared

        movd        dword ptr [rdi], mm2
        movd        dword ptr [rsi], mm4



    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret




;void vp8_filter_block2d_bil_var_mmx
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    unsigned short *HFilter,
;    unsigned short *VFilter,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; 8-pixel-wide counterpart of vp8_filter_block2d_bil4x4_var_mmx: filters
; Height rows (reading Height + 1 rows of ref_ptr), subtracts the matching
; src_ptr rows, and stores the sum of differences to *sum and the sum of
; squared differences to *sumsquared.
;
; Register roles:
;   rsi = ref row pointer           rdi = src row pointer
;   rax = HFilter                   rdx = VFilter
;   rcx = rows remaining (Height)
;   mm0 = zero                      mm5 = previous filtered row, packed to bytes
;   mm6 = 4 x int16 difference sums mm7 = 2 x int32 sums of squares
;
; NOTE(review): the loop tests the row count only after the first pass, so
; Height == 0 would not terminate promptly -- confirm callers never pass 0.
; NOTE(review): the final reduction adds the mm6 lanes while they sit in the
; top 16 bits of each dword, so the total sum must fit in a signed 16-bit
; value -- confirm Height is bounded accordingly.
global sym(vp8_filter_block2d_bil_var_mmx) PRIVATE
sym(vp8_filter_block2d_bil_var_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    GET_GOT     rbx
    push        rsi
    push        rdi
    sub         rsp, 16
    ; end prolog

        pxor        mm6, mm6                ; clear difference accumulator
        pxor        mm7, mm7                ; clear squared-difference accumulator
        mov         rax, arg(5)             ;HFilter

        mov         rdx, arg(6)             ;VFilter
        mov         rsi, arg(0)             ;ref_ptr

        mov         rdi, arg(2)             ;src_ptr
        movsxd      rcx, dword ptr arg(4)   ;Height

        pxor        mm0, mm0                ; zero for byte -> word unpacking

        ; Horizontal pass over the first reference row; result packed in mm5.
        movq        mm1, [rsi]              ; pixels x .. x+7
        movq        mm3, [rsi+1]            ; pixels x+1 .. x+8

        movq        mm2, mm1
        movq        mm4, mm3

        punpcklbw   mm1, mm0
        punpckhbw   mm2, mm0

        pmullw      mm1, [rax]              ; * first horizontal tap
        pmullw      mm2, [rax]

        punpcklbw   mm3, mm0
        punpckhbw   mm4, mm0

        pmullw      mm3, [rax+8]            ; * second horizontal tap
        pmullw      mm4, [rax+8]

        paddw       mm1, mm3
        paddw       mm2, mm4

        paddw       mm1, [GLOBAL(mmx_bi_rd)]    ; round
        psraw       mm1, mmx_filter_shift

        paddw       mm2, [GLOBAL(mmx_bi_rd)]
        psraw       mm2, mmx_filter_shift

        movq        mm5, mm1
        packuswb    mm5, mm2                ; keep the filtered row as 8 bytes

%if ABI_IS_32BIT
        add         rsi, dword ptr arg(1)   ;ref_pixels_per_line
%else
        movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
        add         rsi, r8
%endif

.filter_block2d_bil_var_mmx_loop:

        ; Horizontal pass over the current reference row (mm1 = low four
        ; results, mm2 = high four results).
        movq        mm1, [rsi]
        movq        mm3, [rsi+1]

        movq        mm2, mm1
        movq        mm4, mm3

        punpcklbw   mm1, mm0
        punpckhbw   mm2, mm0

        pmullw      mm1, [rax]
        pmullw      mm2, [rax]

        punpcklbw   mm3, mm0
        punpckhbw   mm4, mm0

        pmullw      mm3, [rax+8]
        pmullw      mm4, [rax+8]

        paddw       mm1, mm3
        paddw       mm2, mm4

        paddw       mm1, [GLOBAL(mmx_bi_rd)]
        psraw       mm1, mmx_filter_shift

        paddw       mm2, [GLOBAL(mmx_bi_rd)]
        psraw       mm2, mmx_filter_shift

        ; Unpack the previous filtered row, then store the current one.
        movq        mm3, mm5
        movq        mm4, mm5

        punpcklbw   mm3, mm0
        punpckhbw   mm4, mm0

        movq        mm5, mm1
        packuswb    mm5, mm2

        ; Vertical pass: blend previous and current filtered rows.
        pmullw      mm3, [rdx]              ; * first vertical tap
        pmullw      mm4, [rdx]

        pmullw      mm1, [rdx+8]            ; * second vertical tap
        pmullw      mm2, [rdx+8]

        paddw       mm1, mm3
        paddw       mm2, mm4

        paddw       mm1, [GLOBAL(mmx_bi_rd)]    ; round
        paddw       mm2, [GLOBAL(mmx_bi_rd)]

        psraw       mm1, mmx_filter_shift
        psraw       mm2, mmx_filter_shift

        ; Difference against the source row, then accumulate.
        movq        mm3, [rdi]
        movq        mm4, mm3

        punpcklbw   mm3, mm0
        punpckhbw   mm4, mm0

        psubw       mm1, mm3
        psubw       mm2, mm4

        paddw       mm6, mm1                ; sum of differences
        pmaddwd     mm1, mm1

        paddw       mm6, mm2
        pmaddwd     mm2, mm2

        paddd       mm7, mm1                ; sum of squared differences
        paddd       mm7, mm2

%if ABI_IS_32BIT
        add         rsi, dword ptr arg(1)   ;ref_pixels_per_line
        add         rdi, dword ptr arg(3)   ;src_pixels_per_line
%else
        movsxd      r8, dword ptr arg(1)    ;ref_pixels_per_line
        movsxd      r9, dword ptr arg(3)    ;src_pixels_per_line
        add         rsi, r8
        add         rdi, r9
%endif
        sub         rcx, 1
        jnz         .filter_block2d_bil_var_mmx_loop

        ; Reduce mm6: place each 16-bit lane in the top half of a dword, add
        ; the lanes, then arithmetic-shift right by 16 to get a signed total.
        pxor        mm3, mm3
        pxor        mm2, mm2

        punpcklwd   mm2, mm6
        punpckhwd   mm3, mm6

        paddd       mm2, mm3
        movq        mm6, mm2

        psrlq       mm6, 32
        paddd       mm2, mm6

        psrad       mm2, 16

        ; Reduce mm7: add the two 32-bit lanes.
        movq        mm4, mm7

        psrlq       mm4, 32
        paddd       mm4, mm7

        mov         rdi, arg(7)             ;sum
        mov         rsi, arg(8)             ;sumsquared

        movd        dword ptr [rdi], mm2
        movd        dword ptr [rsi], mm4

    ; begin epilog
    add         rsp, 16
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
;short mmx_bi_rd[4] = { 64, 64, 64, 64};
; Rounding term added before the mmx_filter_shift right shift.
align 16
mmx_bi_rd:
    times 4 dw 64