;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

;-----------------------------------------------------------------------------
; media/libvpx/vp8/common/x86/recon_sse2.asm
;
; 16x16 block copy plus VP8 intra predictors (8x8 "uv" and 16x16 "y") written
; for MMX / MMX2 / SSE2 / SSSE3.  Syntax: Intel (NASM/Yasm).
;
; Everything ABI-related is hidden behind macros from x86_abi_support.asm,
; which is not part of this file.  The notes below describe how they are
; *used* here; their exact expansion should be confirmed in that include:
;   sym(x)                  - symbol name for x
;   arg(n)                  - n-th C argument (valid after SHADOW_ARGS_TO_STACK)
;   SHADOW_ARGS_TO_STACK n / UNSHADOW_ARGS
;   GET_GOT rbx / RESTORE_GOT / GLOBAL(x) - access to the constants at the end
;   SAVE_XMM n / RESTORE_XMM
;
; Notes that apply to several routines:
;   * movdqa is used for every 16-byte store (and for the 16-byte loads of
;     `above`), so those addresses must be 16-byte aligned.
;   * The MMX routines do not execute emms; presumably the callers clear the
;     MMX state themselves -- TODO confirm.
;   * `movd xmm, [left]` loads read 4 bytes although only the first is used.
;-----------------------------------------------------------------------------

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; void vp8_copy_mem16x16_sse2(
;     unsigned char *src,
;     int            src_stride,
;     unsigned char *dst,
;     int            dst_stride
; )
;
; Copies 16 rows of 16 bytes.  Loads are unaligned (movdqu); stores are
; aligned (movdqa), so dst and dst_stride must keep every row 16-byte aligned.
;
; Registers: rsi = src cursor, rax = src_stride,
;            rdi = dst cursor, rcx = dst_stride, xmm0-xmm5 = row data.
; Rows are moved three at a time; loads and stores are interleaved, so the
; statement order below is significant.
;-----------------------------------------------------------------------------
global sym(vp8_copy_mem16x16_sse2) PRIVATE
sym(vp8_copy_mem16x16_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; src
    movdqu      xmm0, [rsi]                 ; row 0

    movsxd      rax, dword ptr arg(1)       ; src_stride
    mov         rdi, arg(2)                 ; dst

    movdqu      xmm1, [rsi+rax]             ; row 1
    movdqu      xmm2, [rsi+rax*2]           ; row 2

    movsxd      rcx, dword ptr arg(3)       ; dst_stride
    lea         rsi, [rsi+rax*2]

    movdqa      [rdi], xmm0
    add         rsi, rax                    ; rsi -> row 3

    movdqa      [rdi+rcx], xmm1
    movdqa      [rdi+rcx*2], xmm2

    lea         rdi, [rdi+rcx*2]
    movdqu      xmm3, [rsi]                 ; row 3

    add         rdi, rcx                    ; rdi -> row 3
    movdqu      xmm4, [rsi+rax]             ; row 4

    movdqu      xmm5, [rsi+rax*2]           ; row 5
    lea         rsi, [rsi+rax*2]

    movdqa      [rdi], xmm3
    add         rsi, rax                    ; rsi -> row 6

    movdqa      [rdi+rcx], xmm4
    movdqa      [rdi+rcx*2], xmm5

    lea         rdi, [rdi+rcx*2]
    movdqu      xmm0, [rsi]                 ; row 6

    add         rdi, rcx                    ; rdi -> row 6
    movdqu      xmm1, [rsi+rax]             ; row 7

    movdqu      xmm2, [rsi+rax*2]           ; row 8
    lea         rsi, [rsi+rax*2]

    movdqa      [rdi], xmm0
    add         rsi, rax                    ; rsi -> row 9

    movdqa      [rdi+rcx], xmm1

    movdqa      [rdi+rcx*2], xmm2
    movdqu      xmm3, [rsi]                 ; row 9

    movdqu      xmm4, [rsi+rax]             ; row 10
    lea         rdi, [rdi+rcx*2]

    add         rdi, rcx                    ; rdi -> row 9
    movdqu      xmm5, [rsi+rax*2]           ; row 11

    lea         rsi, [rsi+rax*2]
    movdqa      [rdi], xmm3

    add         rsi, rax                    ; rsi -> row 12
    movdqa      [rdi+rcx], xmm4

    movdqa      [rdi+rcx*2], xmm5
    movdqu      xmm0, [rsi]                 ; row 12

    lea         rdi, [rdi+rcx*2]
    movdqu      xmm1, [rsi+rax]             ; row 13

    add         rdi, rcx                    ; rdi -> row 12
    movdqu      xmm2, [rsi+rax*2]           ; row 14

    lea         rsi, [rsi+rax*2]            ; rsi -> row 14
    movdqa      [rdi], xmm0

    movdqa      [rdi+rcx], xmm1
    movdqa      [rdi+rcx*2], xmm2

    movdqu      xmm3, [rsi+rax]             ; row 15
    lea         rdi, [rdi+rcx*2]            ; rdi -> row 14

    movdqa      [rdi+rcx], xmm3

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------------
; void vp8_intra_pred_uv_dc_mmx2(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
; )
;
; 8x8 DC prediction: every output byte is
;     (sum(above[0..7]) + sum(left[i*left_stride], i = 0..7) + 8) >> 4
;-----------------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dc_mmx2) PRIVATE
sym(vp8_intra_pred_uv_dc_mmx2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; from top: psadbw against zero sums the 8 bytes of the row above
    mov         rdi, arg(2)                 ; above
    mov         rsi, arg(3)                 ; left
    movsxd      rax, dword ptr arg(4)       ; left_stride
    pxor        mm0, mm0
    movq        mm1, [rdi]
    lea         rdi, [rax*3]                ; rdi = 3 * left_stride
    psadbw      mm1, mm0

    ; from left: ecx accumulates the 8 bytes of the left column
    movzx       ecx, byte [rsi]
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx

    movzx       edx, byte [rsi+rdi]
    lea         rsi, [rsi+rax*4]
    add         ecx, edx
    movzx       edx, byte [rsi]
    add         ecx, edx
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx

    ; add up, round, and broadcast the DC value to 8 bytes
    pextrw      edx, mm1, 0x0
    lea         edx, [edx+ecx+8]
    sar         edx, 4
    movd        mm1, edx
    movsxd      rcx, dword ptr arg(1)       ; dst_stride
    pshufw      mm1, mm1, 0x0
    mov         rdi, arg(0)                 ; dst
    packuswb    mm1, mm1

    ; write out 8 rows
    lea         rax, [rcx*3]
    lea         rdx, [rdi+rcx*4]

    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1
    movq        [rdx      ], mm1
    movq        [rdx+rcx  ], mm1
    movq        [rdx+rcx*2], mm1
    movq        [rdx+rax  ], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; void vp8_intra_pred_uv_dctop_mmx2(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
; )
;
; 8x8 DC prediction from the top row only:
;     (sum(above[0..7]) + 4) >> 3
; left and left_stride are not used.
;-----------------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dctop_mmx2) PRIVATE
sym(vp8_intra_pred_uv_dctop_mmx2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; arg(3), arg(4) not used

    ; from top
    mov         rsi, arg(2)                 ; above
    pxor        mm0, mm0
    movq        mm1, [rsi]
    psadbw      mm1, mm0

    ; add up: round with dc_4, shift, broadcast to 8 bytes
    paddw       mm1, [GLOBAL(dc_4)]
    psraw       mm1, 3
    pshufw      mm1, mm1, 0x0
    packuswb    mm1, mm1

    ; write out 8 rows
    mov         rdi, arg(0)                 ; dst
    movsxd      rcx, dword ptr arg(1)       ; dst_stride
    lea         rax, [rcx*3]

    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1
    lea         rdi, [rdi+rcx*4]
    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; void vp8_intra_pred_uv_dcleft_mmx2(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
; )
;
; 8x8 DC prediction from the left column only:
;     (sum(left[i*left_stride], i = 0..7) + 4) >> 3
; above is not used.
;-----------------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dcleft_mmx2) PRIVATE
sym(vp8_intra_pred_uv_dcleft_mmx2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; arg(2) not used

    ; from left: ecx accumulates, the final lea folds in the last byte + 4
    mov         rsi, arg(3)                 ; left
    movsxd      rax, dword ptr arg(4)       ; left_stride
    lea         rdi, [rax*3]                ; rdi = 3 * left_stride
    movzx       ecx, byte [rsi]
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx
    lea         rsi, [rsi+rax*4]
    movzx       edx, byte [rsi]
    add         ecx, edx
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    lea         edx, [ecx+edx+4]

    ; add up
    shr         edx, 3
    movd        mm1, edx
    pshufw      mm1, mm1, 0x0
    packuswb    mm1, mm1

    ; write out 8 rows
    mov         rdi, arg(0)                 ; dst
    movsxd      rcx, dword ptr arg(1)       ; dst_stride
    lea         rax, [rcx*3]

    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1
    lea         rdi, [rdi+rcx*4]
    movq        [rdi      ], mm1
    movq        [rdi+rcx  ], mm1
    movq        [rdi+rcx*2], mm1
    movq        [rdi+rax  ], mm1

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; void vp8_intra_pred_uv_dc128_mmx(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
; )
;
; Fills the 8x8 block with the constant 128 (first 8 bytes of dc_128).
; above, left and left_stride are not used.
;-----------------------------------------------------------------------------
global sym(vp8_intra_pred_uv_dc128_mmx) PRIVATE
sym(vp8_intra_pred_uv_dc128_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    ; end prolog

    ; arg(2), arg(3), arg(4) not used

    ; write out 8 rows
    movq        mm1, [GLOBAL(dc_128)]
    mov         rax, arg(0)                 ; dst
    movsxd      rdx, dword ptr arg(1)       ; dst_stride
    lea         rcx, [rdx*3]

    movq        [rax      ], mm1
    movq        [rax+rdx  ], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx  ], mm1
    lea         rax, [rax+rdx*4]
    movq        [rax      ], mm1
    movq        [rax+rdx  ], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx  ], mm1

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; void vp8_intra_pred_uv_tm_sse2(
; void vp8_intra_pred_uv_tm_ssse3(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
; )
;
; 8x8 "TM" prediction.  For row r and column c:
;     dst[r][c] = saturate_u8(left[r*left_stride] + above[c] - above[-1])
; (packuswb performs the unsigned saturation.)
;
; %1 selects the variant.  sse2 broadcasts a byte to 8 words with
; punpcklbw/pshuflw/punpcklqdq; ssse3 does it with one pshufb using dc_1024.
; Two rows are produced per loop iteration (edx counts 4 iterations).
;-----------------------------------------------------------------------------
%macro vp8_intra_pred_uv_tm 1
global sym(vp8_intra_pred_uv_tm_%1) PRIVATE
sym(vp8_intra_pred_uv_tm_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; read top row: xmm1 = above[0..7] widened to words
    mov         edx, 4
    mov         rsi, arg(2)                 ; above
    movsxd      rax, dword ptr arg(4)       ; left_stride
    pxor        xmm0, xmm0
%ifidn %1, ssse3
    movdqa      xmm2, [GLOBAL(dc_1024)]     ; byte -> 8 words broadcast mask
%endif
    movq        xmm1, [rsi]
    punpcklbw   xmm1, xmm0

    ; set up left ptrs and subtract topleft: xmm1 = above[c] - above[-1]
    movd        xmm3, [rsi-1]
    mov         rsi, arg(3)                 ; left
%ifidn %1, sse2
    punpcklbw   xmm3, xmm0
    pshuflw     xmm3, xmm3, 0x0
    punpcklqdq  xmm3, xmm3
%else
    pshufb      xmm3, xmm2
%endif
    psubw       xmm1, xmm3

    ; set up dest ptrs
    mov         rdi, arg(0)                 ; dst
    movsxd      rcx, dword ptr arg(1)       ; dst_stride

.vp8_intra_pred_uv_tm_%1_loop:
    movd        xmm3, [rsi]                 ; left pixel, even row
    movd        xmm5, [rsi+rax]             ; left pixel, odd row
%ifidn %1, sse2
    punpcklbw   xmm3, xmm0
    punpcklbw   xmm5, xmm0
    pshuflw     xmm3, xmm3, 0x0
    pshuflw     xmm5, xmm5, 0x0
    punpcklqdq  xmm3, xmm3
    punpcklqdq  xmm5, xmm5
%else
    pshufb      xmm3, xmm2
    pshufb      xmm5, xmm2
%endif
    paddw       xmm3, xmm1
    paddw       xmm5, xmm1
    packuswb    xmm3, xmm5                  ; low 8 bytes = even row, high = odd
    movq        [rdi    ], xmm3
    movhps      [rdi+rcx], xmm3
    lea         rsi, [rsi+rax*2]
    lea         rdi, [rdi+rcx*2]
    dec         edx
    jnz         .vp8_intra_pred_uv_tm_%1_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_uv_tm sse2
vp8_intra_pred_uv_tm ssse3

;-----------------------------------------------------------------------------
; void vp8_intra_pred_uv_ve_mmx(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
; )
;
; 8x8 vertical prediction: the 8 bytes at `above` are copied to each of the
; 8 output rows.  left and left_stride are not used.
;-----------------------------------------------------------------------------
global sym(vp8_intra_pred_uv_ve_mmx) PRIVATE
sym(vp8_intra_pred_uv_ve_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    ; end prolog

    ; arg(3), arg(4) not used

    ; read from top
    mov         rax, arg(2)                 ; above

    movq        mm1, [rax]

    ; write out 8 rows
    mov         rax, arg(0)                 ; dst
    movsxd      rdx, dword ptr arg(1)       ; dst_stride
    lea         rcx, [rdx*3]

    movq        [rax      ], mm1
    movq        [rax+rdx  ], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx  ], mm1
    lea         rax, [rax+rdx*4]
    movq        [rax      ], mm1
    movq        [rax+rdx  ], mm1
    movq        [rax+rdx*2], mm1
    movq        [rax+rcx  ], mm1

    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; void vp8_intra_pred_uv_ho_mmx2(
; void vp8_intra_pred_uv_ho_ssse3(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
; )
;
; 8x8 horizontal prediction: row r is filled with left[r*left_stride].
; above is not used.
;
; %1 selects the variant:
;   mmx2  - loop, two rows per iteration (edx counts 4 iterations).
;   ssse3 - straight-line, four rows per step; rbx is used as scratch
;           (3 * left_stride) *after* the only GLOBAL() access, and is saved
;           explicitly when GET_GOT does not already do so
;           (GET_GOT_SAVE_ARG undefined).
;-----------------------------------------------------------------------------
%macro vp8_intra_pred_uv_ho 1
global sym(vp8_intra_pred_uv_ho_%1) PRIVATE
sym(vp8_intra_pred_uv_ho_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
%ifidn %1, ssse3
%ifndef GET_GOT_SAVE_ARG
    push        rbx
%endif
    GET_GOT     rbx
%endif
    ; end prolog

    ; arg(2) not used

    ; read from left and write out
%ifidn %1, mmx2
    mov         edx, 4
%endif
    mov         rsi, arg(3)                 ; left
    movsxd      rax, dword ptr arg(4)       ; left_stride
    mov         rdi, arg(0)                 ; dst
    movsxd      rcx, dword ptr arg(1)       ; dst_stride
%ifidn %1, ssse3
    lea         rdx, [rcx*3]                ; rdx = 3 * dst_stride
    movdqa      xmm2, [GLOBAL(dc_00001111)] ; must precede the rbx overwrite
    lea         rbx, [rax*3]                ; rbx = 3 * left_stride
%endif

%ifidn %1, mmx2
.vp8_intra_pred_uv_ho_%1_loop:
    movd        mm0, [rsi]
    movd        mm1, [rsi+rax]
    punpcklbw   mm0, mm0
    punpcklbw   mm1, mm1
    pshufw      mm0, mm0, 0x0
    pshufw      mm1, mm1, 0x0
    movq        [rdi    ], mm0
    movq        [rdi+rcx], mm1
    lea         rsi, [rsi+rax*2]
    lea         rdi, [rdi+rcx*2]
    dec         edx
    jnz         .vp8_intra_pred_uv_ho_%1_loop
%else
    ; rows 0-3: interleave two left pixels, then pshufb spreads byte 0 over
    ; the low 8 bytes and byte 1 over the high 8 bytes
    movd        xmm0, [rsi]
    movd        xmm3, [rsi+rax]
    movd        xmm1, [rsi+rax*2]
    movd        xmm4, [rsi+rbx]
    punpcklbw   xmm0, xmm3
    punpcklbw   xmm1, xmm4
    pshufb      xmm0, xmm2
    pshufb      xmm1, xmm2
    movq        [rdi      ], xmm0
    movhps      [rdi+rcx  ], xmm0
    movq        [rdi+rcx*2], xmm1
    movhps      [rdi+rdx  ], xmm1
    lea         rsi, [rsi+rax*4]
    lea         rdi, [rdi+rcx*4]
    ; rows 4-7
    movd        xmm0, [rsi]
    movd        xmm3, [rsi+rax]
    movd        xmm1, [rsi+rax*2]
    movd        xmm4, [rsi+rbx]
    punpcklbw   xmm0, xmm3
    punpcklbw   xmm1, xmm4
    pshufb      xmm0, xmm2
    pshufb      xmm1, xmm2
    movq        [rdi      ], xmm0
    movhps      [rdi+rcx  ], xmm0
    movq        [rdi+rcx*2], xmm1
    movhps      [rdi+rdx  ], xmm1
%endif

    ; begin epilog
%ifidn %1, ssse3
    RESTORE_GOT
%ifndef GET_GOT_SAVE_ARG
    pop         rbx
%endif
%endif
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_uv_ho mmx2
vp8_intra_pred_uv_ho ssse3

;-----------------------------------------------------------------------------
; void vp8_intra_pred_y_dc_sse2(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
; )
;
; 16x16 DC prediction: every output byte is
;     (sum(above[0..15]) + sum(left[i*left_stride], i = 0..15) + 16) >> 5
; above and every dst row must be 16-byte aligned (movdqa).
;-----------------------------------------------------------------------------
global sym(vp8_intra_pred_y_dc_sse2) PRIVATE
sym(vp8_intra_pred_y_dc_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; from top: psadbw yields one partial sum per 8-byte half; add the halves
    mov         rdi, arg(2)                 ; above
    mov         rsi, arg(3)                 ; left
    movsxd      rax, dword ptr arg(4)       ; left_stride

    pxor        xmm0, xmm0
    movdqa      xmm1, [rdi]
    psadbw      xmm1, xmm0
    movq        xmm2, xmm1
    punpckhqdq  xmm1, xmm1
    paddw       xmm1, xmm2

    ; from left: ecx accumulates the 16 bytes of the left column
    lea         rdi, [rax*3]                ; rdi = 3 * left_stride

    movzx       ecx, byte [rsi]
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx
    lea         rsi, [rsi+rax*4]

    movzx       edx, byte [rsi]
    add         ecx, edx
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx
    lea         rsi, [rsi+rax*4]

    movzx       edx, byte [rsi]
    add         ecx, edx
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx
    lea         rsi, [rsi+rax*4]

    movzx       edx, byte [rsi]
    add         ecx, edx
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx

    ; add up, round, and broadcast the DC value to 16 bytes
    pextrw      edx, xmm1, 0x0
    lea         edx, [edx+ecx+16]
    sar         edx, 5
    movd        xmm1, edx
    ; FIXME use pshufb for ssse3 version
    pshuflw     xmm1, xmm1, 0x0
    punpcklqdq  xmm1, xmm1
    packuswb    xmm1, xmm1

    ; write out: 2 iterations x 8 rows
    mov         rsi, 2
    mov         rdi, arg(0)                 ; dst
    movsxd      rcx, dword ptr arg(1)       ; dst_stride
    lea         rax, [rcx*3]

.write_loop:
    movdqa      [rdi      ], xmm1
    movdqa      [rdi+rcx  ], xmm1
    movdqa      [rdi+rcx*2], xmm1
    movdqa      [rdi+rax  ], xmm1
    lea         rdi, [rdi+rcx*4]
    movdqa      [rdi      ], xmm1
    movdqa      [rdi+rcx  ], xmm1
    movdqa      [rdi+rcx*2], xmm1
    movdqa      [rdi+rax  ], xmm1
    lea         rdi, [rdi+rcx*4]
    dec         rsi
    jnz         .write_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; void vp8_intra_pred_y_dctop_sse2(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
; )
;
; 16x16 DC prediction from the top row only:
;     (sum(above[0..15]) + 8) >> 4
; left and left_stride are not used.  above and every dst row must be
; 16-byte aligned (movdqa).
;-----------------------------------------------------------------------------
global sym(vp8_intra_pred_y_dctop_sse2) PRIVATE
sym(vp8_intra_pred_y_dctop_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    GET_GOT     rbx
    ; end prolog

    ; arg(3), arg(4) not used

    ; from top
    mov         rcx, arg(2)                 ; above
    pxor        xmm0, xmm0
    movdqa      xmm1, [rcx]
    psadbw      xmm1, xmm0
    movdqa      xmm2, xmm1
    punpckhqdq  xmm1, xmm1
    paddw       xmm1, xmm2

    ; add up: round with dc_8, shift, broadcast to 16 bytes
    paddw       xmm1, [GLOBAL(dc_8)]
    psraw       xmm1, 4
    ; FIXME use pshufb for ssse3 version
    pshuflw     xmm1, xmm1, 0x0
    punpcklqdq  xmm1, xmm1
    packuswb    xmm1, xmm1

    ; write out: 2 iterations x 8 rows
    mov         rsi, 2
    mov         rdx, arg(0)                 ; dst
    movsxd      rcx, dword ptr arg(1)       ; dst_stride
    lea         rax, [rcx*3]

.write_loop:
    movdqa      [rdx      ], xmm1
    movdqa      [rdx+rcx  ], xmm1
    movdqa      [rdx+rcx*2], xmm1
    movdqa      [rdx+rax  ], xmm1
    lea         rdx, [rdx+rcx*4]
    movdqa      [rdx      ], xmm1
    movdqa      [rdx+rcx  ], xmm1
    movdqa      [rdx+rcx*2], xmm1
    movdqa      [rdx+rax  ], xmm1
    lea         rdx, [rdx+rcx*4]
    dec         rsi
    jnz         .write_loop

    ; begin epilog
    RESTORE_GOT
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; void vp8_intra_pred_y_dcleft_sse2(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
; )
;
; 16x16 DC prediction from the left column only:
;     (sum(left[i*left_stride], i = 0..15) + 8) >> 4
; above is not used.  Every dst row must be 16-byte aligned (movdqa).
;-----------------------------------------------------------------------------
global sym(vp8_intra_pred_y_dcleft_sse2) PRIVATE
sym(vp8_intra_pred_y_dcleft_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; arg(2) not used

    ; from left: ecx accumulates, the final lea folds in the last byte + 8
    mov         rsi, arg(3)                 ; left
    movsxd      rax, dword ptr arg(4)       ; left_stride

    lea         rdi, [rax*3]                ; rdi = 3 * left_stride
    movzx       ecx, byte [rsi]
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx
    lea         rsi, [rsi+rax*4]
    movzx       edx, byte [rsi]
    add         ecx, edx
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx
    lea         rsi, [rsi+rax*4]
    movzx       edx, byte [rsi]
    add         ecx, edx
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    add         ecx, edx
    lea         rsi, [rsi+rax*4]
    movzx       edx, byte [rsi]
    add         ecx, edx
    movzx       edx, byte [rsi+rax]
    add         ecx, edx
    movzx       edx, byte [rsi+rax*2]
    add         ecx, edx
    movzx       edx, byte [rsi+rdi]
    lea         edx, [ecx+edx+8]

    ; add up
    shr         edx, 4
    movd        xmm1, edx
    ; FIXME use pshufb for ssse3 version
    pshuflw     xmm1, xmm1, 0x0
    punpcklqdq  xmm1, xmm1
    packuswb    xmm1, xmm1

    ; write out: 2 iterations x 8 rows
    mov         rsi, 2
    mov         rdi, arg(0)                 ; dst
    movsxd      rcx, dword ptr arg(1)       ; dst_stride
    lea         rax, [rcx*3]

.write_loop:
    movdqa      [rdi      ], xmm1
    movdqa      [rdi+rcx  ], xmm1
    movdqa      [rdi+rcx*2], xmm1
    movdqa      [rdi+rax  ], xmm1
    lea         rdi, [rdi+rcx*4]
    movdqa      [rdi      ], xmm1
    movdqa      [rdi+rcx  ], xmm1
    movdqa      [rdi+rcx*2], xmm1
    movdqa      [rdi+rax  ], xmm1
    lea         rdi, [rdi+rcx*4]
    dec         rsi
    jnz         .write_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; void vp8_intra_pred_y_dc128_sse2(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
; )
;
; Fills the 16x16 block with the constant 128 (dc_128).
; above, left and left_stride are not used.  Every dst row must be
; 16-byte aligned (movdqa).
;-----------------------------------------------------------------------------
global sym(vp8_intra_pred_y_dc128_sse2) PRIVATE
sym(vp8_intra_pred_y_dc128_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    GET_GOT     rbx
    ; end prolog

    ; arg(2), arg(3), arg(4) not used

    ; write out: 2 iterations x 8 rows
    mov         rsi, 2
    movdqa      xmm1, [GLOBAL(dc_128)]
    mov         rax, arg(0)                 ; dst
    movsxd      rdx, dword ptr arg(1)       ; dst_stride
    lea         rcx, [rdx*3]

.write_loop:
    movdqa      [rax      ], xmm1
    movdqa      [rax+rdx  ], xmm1
    movdqa      [rax+rdx*2], xmm1
    movdqa      [rax+rcx  ], xmm1
    lea         rax, [rax+rdx*4]
    movdqa      [rax      ], xmm1
    movdqa      [rax+rdx  ], xmm1
    movdqa      [rax+rdx*2], xmm1
    movdqa      [rax+rcx  ], xmm1
    lea         rax, [rax+rdx*4]
    dec         rsi
    jnz         .write_loop

    ; begin epilog
    RESTORE_GOT
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; void vp8_intra_pred_y_tm_sse2(
; void vp8_intra_pred_y_tm_ssse3(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
; )
;
; 16x16 "TM" prediction.  For row r and column c:
;     dst[r][c] = saturate_u8(left[r*left_stride] + above[c] - above[-1])
; above and every dst row must be 16-byte aligned (movdqa).
;
; %1 selects the variant (see the uv_tm macro for the sse2/ssse3 difference).
; xmm1/xmm2 hold (above - topleft) for columns 0-7 / 8-15 as words.
; xmm6 and xmm7 are used, hence SAVE_XMM 7 / RESTORE_XMM.
; Two rows are produced per loop iteration (edx counts 8 iterations).
;-----------------------------------------------------------------------------
%macro vp8_intra_pred_y_tm 1
global sym(vp8_intra_pred_y_tm_%1) PRIVATE
sym(vp8_intra_pred_y_tm_%1):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    GET_GOT     rbx
    ; end prolog

    ; read top row
    mov         edx, 8
    mov         rsi, arg(2)                 ; above
    movsxd      rax, dword ptr arg(4)       ; left_stride
    pxor        xmm0, xmm0
%ifidn %1, ssse3
    movdqa      xmm3, [GLOBAL(dc_1024)]     ; byte -> 8 words broadcast mask
%endif
    movdqa      xmm1, [rsi]
    movdqa      xmm2, xmm1
    punpcklbw   xmm1, xmm0                  ; columns 0-7 as words
    punpckhbw   xmm2, xmm0                  ; columns 8-15 as words

    ; set up left ptrs and subtract topleft
    movd        xmm4, [rsi-1]
    mov         rsi, arg(3)                 ; left
%ifidn %1, sse2
    punpcklbw   xmm4, xmm0
    pshuflw     xmm4, xmm4, 0x0
    punpcklqdq  xmm4, xmm4
%else
    pshufb      xmm4, xmm3
%endif
    psubw       xmm1, xmm4
    psubw       xmm2, xmm4

    ; set up dest ptrs
    mov         rdi, arg(0)                 ; dst
    movsxd      rcx, dword ptr arg(1)       ; dst_stride

    ; local label (was file-scope), consistent with the uv_tm macro
.vp8_intra_pred_y_tm_%1_loop:
    movd        xmm4, [rsi]                 ; left pixel, even row
    movd        xmm5, [rsi+rax]             ; left pixel, odd row
%ifidn %1, sse2
    punpcklbw   xmm4, xmm0
    punpcklbw   xmm5, xmm0
    pshuflw     xmm4, xmm4, 0x0
    pshuflw     xmm5, xmm5, 0x0
    punpcklqdq  xmm4, xmm4
    punpcklqdq  xmm5, xmm5
%else
    pshufb      xmm4, xmm3
    pshufb      xmm5, xmm3
%endif
    movdqa      xmm6, xmm4
    movdqa      xmm7, xmm5
    paddw       xmm4, xmm1
    paddw       xmm6, xmm2
    paddw       xmm5, xmm1
    paddw       xmm7, xmm2
    packuswb    xmm4, xmm6
    packuswb    xmm5, xmm7
    movdqa      [rdi    ], xmm4
    movdqa      [rdi+rcx], xmm5
    lea         rsi, [rsi+rax*2]
    lea         rdi, [rdi+rcx*2]
    dec         edx
    jnz         .vp8_intra_pred_y_tm_%1_loop

    ; begin epilog
    RESTORE_GOT
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro

vp8_intra_pred_y_tm sse2
vp8_intra_pred_y_tm ssse3

;-----------------------------------------------------------------------------
; void vp8_intra_pred_y_ve_sse2(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
; )
;
; 16x16 vertical prediction: the 16 bytes at `above` are copied to each of
; the 16 output rows.  left and left_stride are not used.  above and every
; dst row must be 16-byte aligned (movdqa).
;-----------------------------------------------------------------------------
global sym(vp8_intra_pred_y_ve_sse2) PRIVATE
sym(vp8_intra_pred_y_ve_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    ; end prolog

    ; arg(3), arg(4) not used

    mov         rax, arg(2)                 ; above
    mov         rsi, 2
    movsxd      rdx, dword ptr arg(1)       ; dst_stride

    ; read from top
    movdqa      xmm1, [rax]

    ; write out: 2 iterations x 8 rows
    mov         rax, arg(0)                 ; dst
    lea         rcx, [rdx*3]

.write_loop:
    movdqa      [rax      ], xmm1
    movdqa      [rax+rdx  ], xmm1
    movdqa      [rax+rdx*2], xmm1
    movdqa      [rax+rcx  ], xmm1
    lea         rax, [rax+rdx*4]
    movdqa      [rax      ], xmm1
    movdqa      [rax+rdx  ], xmm1
    movdqa      [rax+rdx*2], xmm1
    movdqa      [rax+rcx  ], xmm1
    lea         rax, [rax+rdx*4]
    dec         rsi
    jnz         .write_loop

    ; begin epilog
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; void vp8_intra_pred_y_ho_sse2(
;     unsigned char *dst,
;     int            dst_stride,
;     unsigned char *above,
;     unsigned char *left,
;     int            left_stride
; )
;
; 16x16 horizontal prediction: row r is filled with left[r*left_stride].
; above is not used.  Every dst row must be 16-byte aligned (movdqa).
; Two rows are produced per loop iteration (edx counts 8 iterations).
;-----------------------------------------------------------------------------
global sym(vp8_intra_pred_y_ho_sse2) PRIVATE
sym(vp8_intra_pred_y_ho_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    ; arg(2) not used

    ; read from left and write out
    mov         edx, 8
    mov         rsi, arg(3)                 ; left
    movsxd      rax, dword ptr arg(4)       ; left_stride
    mov         rdi, arg(0)                 ; dst
    movsxd      rcx, dword ptr arg(1)       ; dst_stride

    ; local label (was file-scope), consistent with the other loops here
.vp8_intra_pred_y_ho_sse2_loop:
    movd        xmm0, [rsi]
    movd        xmm1, [rsi+rax]
    ; FIXME use pshufb for ssse3 version
    punpcklbw   xmm0, xmm0
    punpcklbw   xmm1, xmm1
    pshuflw     xmm0, xmm0, 0x0
    pshuflw     xmm1, xmm1, 0x0
    punpcklqdq  xmm0, xmm0
    punpcklqdq  xmm1, xmm1
    movdqa      [rdi    ], xmm0
    movdqa      [rdi+rcx], xmm1
    lea         rsi, [rsi+rax*2]
    lea         rdi, [rdi+rcx*2]
    dec         edx
    jnz         .vp8_intra_pred_y_ho_sse2_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
; Read-only constants
;-----------------------------------------------------------------------------
SECTION_RODATA
align 16
dc_128:                                     ; 16 bytes of 128 (dc128 fill value)
    times 16 db 128
dc_4:                                       ; rounding term for (sum + 4) >> 3
    times 4 dw 4
align 16
dc_8:                                       ; rounding term for (sum + 8) >> 4
    times 8 dw 8
align 16
dc_1024:                                    ; pshufb mask bytes 00,04,00,04,...:
    times 8 dw 0x400                        ; byte 0 -> low byte of each word,
                                            ; byte 4 (zero after movd) -> high
align 16
dc_00001111:                                ; pshufb mask: byte 0 -> low 8 bytes,
    times 8 db 0                            ; byte 1 -> high 8 bytes
    times 8 db 1