;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; Conventions shared by every macro and function in this file
;
; Syntax: Intel operand order (NASM/yasm style). Register names, arg(),
; sym(), SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, ALIGN_STACK etc. come
; from x86_abi_support.asm.
;
; Arguments (same order for all exported functions):
;   arg(0) src_ptr        arg(1) src pitch       arg(2) output_ptr
;   arg(3) output pitch   arg(4) output_height   arg(5) filter (8 shorts)
;
; Filter handling:
;   * The eight 16-bit taps are read with movdqa, so the filter pointer
;     must be 16-byte aligned.
;   * packsswb narrows the taps to signed bytes (with saturation) so that
;     pmaddubsw can multiply two unsigned pixels by two signed taps and
;     add the two products into one signed word.
;   * krd holds 64 in every word: the rounding term added before the
;     arithmetic shift right by 7.
;
; Accumulation order:
;   The four partial sums are combined with saturating adds as
;       (k0k1 + k6k7) + min(k2k3, k4k5) + max(k2k3, k4k5)
;   Adding the smaller of the two centre terms first keeps intermediate
;   sums from saturating in cases where the final sum still fits in a
;   signed word.
;
; k0k1, k2k3, k4k5, k6k7, k0k1k4k5, k2k3k6k7 and krd are 16-byte stack
; slots that each function %defines before expanding a macro.
;-----------------------------------------------------------------------

;-----------------------------------------------------------------------
; VERTx4 avg
;   8-tap vertical filter, 4 output pixels per row.
;   %1 = 0: store the result; %1 = 1: pavgb the result with the bytes
;           already at the destination, then store.
;   src_ptr addresses the first of the eight source rows (rows A..H at
;   src + 0..7 * pitch). Runs output_height rows; the loop is a
;   dec/jnz loop, so output_height must be at least 1.
;   Clobbers: rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7 (and r8 when
;   ABI_IS_32BIT is 0).
;-----------------------------------------------------------------------
%macro VERTx4 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64 (rounding)

    movdqa      xmm4, [rdx]                 ;load filters
    movd        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0                  ;copy low half to high half
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;64 in every word
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx                    ;rax = src + pitch

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

.loop:
    movd        xmm0, [rsi]                 ;A
    movd        xmm1, [rsi + rdx]           ;B
    movd        xmm2, [rsi + rdx * 2]       ;C
    movd        xmm3, [rax + rdx * 2]       ;D
    movd        xmm4, [rsi + rdx * 4]       ;E
    movd        xmm5, [rax + rdx * 4]       ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movd        xmm6, [rsi + rbx]           ;G
    movd        xmm7, [rax + rbx]           ;H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    punpcklbw   xmm6, xmm7                  ;G H
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    movdqa      xmm1, xmm2
    paddsw      xmm0, xmm6                  ;outer taps first
    pmaxsw      xmm2, xmm4                  ;xmm2 = max(CD, EF)
    pminsw      xmm4, xmm1                  ;xmm4 = min(CD, EF)
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ;round
    psraw       xmm0, 7
    packuswb    xmm0, xmm0                  ;clamp to 0..255

    add         rsi, rdx
    add         rax, rdx
%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm

;-----------------------------------------------------------------------
; VERTx8 avg
;   8-tap vertical filter, 8 output pixels per row. Same contract as
;   VERTx4 except that 8 bytes are loaded/stored per row (movq).
;   Clobbers: rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7 (and r8 when
;   ABI_IS_32BIT is 0).
;-----------------------------------------------------------------------
%macro VERTx8 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64 (rounding)

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0                  ;copy low half to high half
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;64 in every word
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx                    ;rax = src + pitch

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

.loop:
    movq        xmm0, [rsi]                 ;A
    movq        xmm1, [rsi + rdx]           ;B
    movq        xmm2, [rsi + rdx * 2]       ;C
    movq        xmm3, [rax + rdx * 2]       ;D
    movq        xmm4, [rsi + rdx * 4]       ;E
    movq        xmm5, [rax + rdx * 4]       ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movq        xmm6, [rsi + rbx]           ;G
    movq        xmm7, [rax + rbx]           ;H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    punpcklbw   xmm6, xmm7                  ;G H
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    paddsw      xmm0, xmm6                  ;outer taps first
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ;xmm2 = max(CD, EF)
    pminsw      xmm4, xmm1                  ;xmm4 = min(CD, EF)
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ;round
    psraw       xmm0, 7
    packuswb    xmm0, xmm0                  ;clamp to 0..255

    add         rsi, rdx
    add         rax, rdx
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm

;-----------------------------------------------------------------------
; VERTx16 avg
;   8-tap vertical filter, 16 output pixels per row, computed as two
;   independent 8-pixel halves (columns 0..7, then columns 8..15).
;   Same contract as VERTx4 otherwise.
;   Both halves use the identical min/max accumulation order so that a
;   given column of input produces the same output regardless of which
;   half it falls in.
;   Clobbers: rax, rbx, rcx, rdx, rsi, rdi, xmm0-xmm7 (and r8 when
;   ABI_IS_32BIT is 0).
;-----------------------------------------------------------------------
%macro VERTx16 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64 (rounding)

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0                  ;copy low half to high half
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;64 in every word
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx                    ;rax = src + pitch

    lea         rbx, [rdx + rdx*4]
    add         rbx, rdx                    ;pitch * 6

.loop:
    ; ---- columns 0..7 ----
    movq        xmm0, [rsi]                 ;A
    movq        xmm1, [rsi + rdx]           ;B
    movq        xmm2, [rsi + rdx * 2]       ;C
    movq        xmm3, [rax + rdx * 2]       ;D
    movq        xmm4, [rsi + rdx * 4]       ;E
    movq        xmm5, [rax + rdx * 4]       ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movq        xmm6, [rsi + rbx]           ;G
    movq        xmm7, [rax + rbx]           ;H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    punpcklbw   xmm6, xmm7                  ;G H
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    paddsw      xmm0, xmm6                  ;outer taps first
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ;xmm2 = max(CD, EF)
    pminsw      xmm4, xmm1                  ;xmm4 = min(CD, EF)
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ;round
    psraw       xmm0, 7
    packuswb    xmm0, xmm0                  ;clamp to 0..255
%if %1
    movq        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi], xmm0

    ; ---- columns 8..15 ----
    movq        xmm0, [rsi + 8]             ;A
    movq        xmm1, [rsi + rdx + 8]       ;B
    movq        xmm2, [rsi + rdx * 2 + 8]   ;C
    movq        xmm3, [rax + rdx * 2 + 8]   ;D
    movq        xmm4, [rsi + rdx * 4 + 8]   ;E
    movq        xmm5, [rax + rdx * 4 + 8]   ;F

    punpcklbw   xmm0, xmm1                  ;A B
    punpcklbw   xmm2, xmm3                  ;C D
    punpcklbw   xmm4, xmm5                  ;E F

    movq        xmm6, [rsi + rbx + 8]       ;G
    movq        xmm7, [rax + rbx + 8]       ;H
    punpcklbw   xmm6, xmm7                  ;G H

    pmaddubsw   xmm0, k0k1
    pmaddubsw   xmm2, k2k3
    pmaddubsw   xmm4, k4k5
    pmaddubsw   xmm6, k6k7

    ; Same order as the first half: a plain x0+x6+x2+x4 chain can
    ; saturate on an intermediate sum and give a different result.
    paddsw      xmm0, xmm6                  ;outer taps first
    movdqa      xmm1, xmm2
    pmaxsw      xmm2, xmm4                  ;xmm2 = max(CD, EF)
    pminsw      xmm4, xmm1                  ;xmm4 = min(CD, EF)
    paddsw      xmm0, xmm4
    paddsw      xmm0, xmm2

    paddsw      xmm0, krd                   ;round
    psraw       xmm0, 7
    packuswb    xmm0, xmm0                  ;clamp to 0..255

    add         rsi, rdx
    add         rax, rdx
%if %1
    movq        xmm1, [rdi+8]
    pavgb       xmm0, xmm1
%endif

    movq        [rdi+8], xmm0

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .loop
%endm

;void vp9_filter_block1d4_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
global sym(vp9_filter_block1d4_v8_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5                   ;five aligned 16-byte slots
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    VERTx4 0

    add         rsp, 16*5
    pop         rsp                         ;undo ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_filter_block1d8_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
global sym(vp9_filter_block1d8_v8_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5                   ;five aligned 16-byte slots
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    VERTx8 0

    add         rsp, 16*5
    pop         rsp                         ;undo ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_filter_block1d16_v8_ssse3
;(
;    unsigned char *src_ptr,
;    unsigned int   src_pitch,
;    unsigned char *output_ptr,
;    unsigned int   out_pitch,
;    unsigned int   output_height,
;    short *filter
;)
global sym(vp9_filter_block1d16_v8_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5                   ;five aligned 16-byte slots
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    VERTx16 0

    add         rsp, 16*5
    pop         rsp                         ;undo ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

; Averaging variants of the vertical filters. Same arguments as the
; non-avg functions above; the filtered pixels are combined with the
; existing destination bytes using pavgb before being stored.

global sym(vp9_filter_block1d4_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_v8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5                   ;five aligned 16-byte slots
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    VERTx4 1

    add         rsp, 16*5
    pop         rsp                         ;undo ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d8_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_v8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5                   ;five aligned 16-byte slots
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    VERTx8 1

    add         rsp, 16*5
    pop         rsp                         ;undo ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d16_v8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_v8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
    push        rbx
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5                   ;five aligned 16-byte slots
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    VERTx16 1

    add         rsp, 16*5
    pop         rsp                         ;undo ALIGN_STACK
    pop         rbx
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

;-----------------------------------------------------------------------
; HORIZx4_ROW src, tmp
;   Filters one row of 4 pixels horizontally.
;   In:  %1 = 16 source bytes starting at src - 3.
;   Out: %1 = filtered pixels packed to bytes (the low 4 bytes are the
;        ones the callers store).
;   Uses k0k1k4k5 / k2k3k6k7 so that one pmaddubsw yields the k0k1
;   (resp. k2k3) products in the low quadword and the k4k5 (resp. k6k7)
;   products in the high quadword.
;   Clobbers: %2, xmm4, xmm5, xmm6.
;-----------------------------------------------------------------------
%macro HORIZx4_ROW 2
    movdqa      %2,   %1
    pshufb      %1,   [GLOBAL(shuf_t0t1)]
    pshufb      %2,   [GLOBAL(shuf_t2t3)]
    pmaddubsw   %1,   k0k1k4k5
    pmaddubsw   %2,   k2k3k6k7

    movdqa      xmm4, %1                    ;low qword: k0k1 sums
    movdqa      xmm5, %2                    ;low qword: k2k3 sums
    psrldq      %1,   8                     ;k4k5 sums
    psrldq      %2,   8                     ;k6k7 sums
    movdqa      xmm6, xmm5

    paddsw      xmm4, %2                    ;k0k1 + k6k7
    pmaxsw      xmm5, %1                    ;max(k2k3, k4k5)
    pminsw      %1, xmm6                    ;min(k2k3, k4k5)
    paddsw      %1, xmm4
    paddsw      %1, xmm5

    paddsw      %1,   krd                   ;round
    psraw       %1,   7
    packuswb    %1,   %1                    ;clamp to 0..255
%endm

;-----------------------------------------------------------------------
; HORIZx4 avg
;   8-tap horizontal filter, 4 output pixels per row.
;   %1 = 0: store the result; %1 = 1: pavgb with the destination first.
;   Reads 16 bytes per row starting at src - 3. Rows are processed in
;   pairs, followed by a single trailing row when output_height is odd.
;   Clobbers: rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
;-----------------------------------------------------------------------
%macro HORIZx4 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64 (rounding)

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes
    pshuflw     xmm6, xmm4, 0b              ;k0_k1
    pshufhw     xmm6, xmm6, 10101010b       ;k0_k1_k4_k5
    pshuflw     xmm7, xmm4, 01010101b       ;k2_k3
    pshufhw     xmm7, xmm7, 11111111b       ;k2_k3_k6_k7
    pshufd      xmm5, xmm5, 0               ;rounding

    movdqa      k0k1k4k5, xmm6
    movdqa      k2k3k6k7, xmm7
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;number of row pairs
    jz          .last_row                   ;no full pair: skip the loop
                                            ;(dec/jnz would wrap from 0)
.loop:
    ;Do two rows at once
    movq        xmm0,   [rsi - 3]           ;load src
    movq        xmm1,   [rsi + 5]
    movq        xmm2,   [rsi + rax - 3]
    movq        xmm3,   [rsi + rax + 5]
    punpcklqdq  xmm0,   xmm1
    punpcklqdq  xmm2,   xmm3

    HORIZx4_ROW xmm0,   xmm1
    HORIZx4_ROW xmm2,   xmm3
%if %1
    movd        xmm1,   [rdi]
    pavgb       xmm0,   xmm1
    movd        xmm3,   [rdi + rdx]
    pavgb       xmm2,   xmm3
%endif
    movd        [rdi],  xmm0
    movd        [rdi +rdx],  xmm2

    lea         rsi,    [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi,    [rsi + rax]
    lea         rdi,    [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]

    dec         rcx
    jnz         .loop

.last_row:
    ; Do last row if output_height is odd
    movsxd      rcx,    dword ptr arg(4)    ;output_height
    and         rcx,    1
    je          .done

    movq        xmm0,   [rsi - 3]           ; load src
    movq        xmm1,   [rsi + 5]
    punpcklqdq  xmm0,   xmm1

    HORIZx4_ROW xmm0, xmm1
%if %1
    movd        xmm1,   [rdi]
    pavgb       xmm0,   xmm1
%endif
    movd        [rdi],  xmm0
.done:
%endm

;-----------------------------------------------------------------------
; HORIZx8_ROW src, tmp1, tmp2, tmp3
;   Filters one row of 8 pixels horizontally.
;   In:  %1 = 16 source bytes starting at src - 3.
;   Out: %1 = filtered pixels packed to bytes (low 8 bytes are stored
;        by the callers).
;   Clobbers: %2, %3, %4.
;-----------------------------------------------------------------------
%macro HORIZx8_ROW 4
    movdqa      %2,   %1
    movdqa      %3,   %1
    movdqa      %4,   %1

    pshufb      %1,   [GLOBAL(shuf_t0t1)]
    pshufb      %2,   [GLOBAL(shuf_t2t3)]
    pshufb      %3,   [GLOBAL(shuf_t4t5)]
    pshufb      %4,   [GLOBAL(shuf_t6t7)]

    pmaddubsw   %1,   k0k1
    pmaddubsw   %2,   k2k3
    pmaddubsw   %3,   k4k5
    pmaddubsw   %4,   k6k7

    paddsw      %1,   %4                    ;outer taps first
    movdqa      %4,   %2
    pmaxsw      %2,   %3                    ;max(k2k3, k4k5)
    pminsw      %3,   %4                    ;min(k2k3, k4k5)
    paddsw      %1,   %3
    paddsw      %1,   %2

    paddsw      %1,   krd                   ;round
    psraw       %1,   7
    packuswb    %1,   %1                    ;clamp to 0..255
%endm

;-----------------------------------------------------------------------
; HORIZx8 avg
;   8-tap horizontal filter, 8 output pixels per row.
;   %1 = 0: store the result; %1 = 1: pavgb with the destination first.
;   Reads 16 bytes per row starting at src - 3. Rows are processed in
;   pairs, followed by a single trailing row when output_height is odd.
;   Clobbers: rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
;-----------------------------------------------------------------------
%macro HORIZx8 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64 (rounding)

    movdqa      xmm4, [rdx]                 ;load filters
    movd        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0                  ;copy low half to high half
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;64 in every word
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height
    shr         rcx, 1                      ;number of row pairs
    jz          .last_row                   ;no full pair: skip the loop
                                            ;(dec/jnz would wrap from 0)

.loop:
    movq        xmm0,   [rsi - 3]           ;load src
    movq        xmm3,   [rsi + 5]
    movq        xmm4,   [rsi + rax - 3]
    movq        xmm7,   [rsi + rax + 5]
    punpcklqdq  xmm0,   xmm3
    punpcklqdq  xmm4,   xmm7

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
    HORIZx8_ROW xmm4, xmm5, xmm6, xmm7
%if %1
    movq        xmm1,   [rdi]
    movq        xmm2,   [rdi + rdx]
    pavgb       xmm0,   xmm1
    pavgb       xmm4,   xmm2
%endif
    movq        [rdi],  xmm0
    movq        [rdi + rdx],  xmm4

    lea         rsi,    [rsi + rax]
    prefetcht0  [rsi + 4 * rax - 3]
    lea         rsi,    [rsi + rax]
    lea         rdi,    [rdi + 2 * rdx]
    prefetcht0  [rsi + 2 * rax - 3]
    dec         rcx
    jnz         .loop

.last_row:
    ;Do last row if output_height is odd
    movsxd      rcx,    dword ptr arg(4)    ;output_height
    and         rcx,    1
    je          .done

    movq        xmm0,   [rsi - 3]
    movq        xmm3,   [rsi + 5]
    punpcklqdq  xmm0,   xmm3

    HORIZx8_ROW xmm0, xmm1, xmm2, xmm3
%if %1
    movq        xmm1,   [rdi]
    pavgb       xmm0,   xmm1
%endif
    movq        [rdi],  xmm0
.done:
%endm

;-----------------------------------------------------------------------
; HORIZx16 avg
;   8-tap horizontal filter, 16 output pixels per row, one row per
;   iteration. %1 = 0: store; %1 = 1: pavgb with the destination first.
;   Reads 24 bytes per row starting at src - 3 (three movq loads).
;   The destination is accessed with movdqa, so output_ptr and
;   output_pitch must keep every row 16-byte aligned.
;   The loop is dec/jnz, so output_height must be at least 1.
;   Clobbers: rax, rcx, rdx, rsi, rdi, xmm0-xmm7.
;-----------------------------------------------------------------------
%macro HORIZx16 1
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, 0x0400040              ;two words of 64 (rounding)

    movdqa      xmm4, [rdx]                 ;load filters
    movq        xmm5, rcx
    packsswb    xmm4, xmm4                  ;taps as signed bytes
    pshuflw     xmm0, xmm4, 0b              ;k0_k1
    pshuflw     xmm1, xmm4, 01010101b       ;k2_k3
    pshuflw     xmm2, xmm4, 10101010b       ;k4_k5
    pshuflw     xmm3, xmm4, 11111111b       ;k6_k7

    punpcklqdq  xmm0, xmm0                  ;copy low half to high half
    punpcklqdq  xmm1, xmm1
    punpcklqdq  xmm2, xmm2
    punpcklqdq  xmm3, xmm3

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm1
    pshufd      xmm5, xmm5, 0               ;64 in every word
    movdqa      k4k5, xmm2
    movdqa      k6k7, xmm3
    movdqa      krd, xmm5

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rdx, dword ptr arg(3)       ;output_pitch
    movsxd      rcx, dword ptr arg(4)       ;output_height

.loop:
    prefetcht0  [rsi + 2 * rax -3]

    movq        xmm0,   [rsi - 3]           ;load src data
    movq        xmm4,   [rsi + 5]
    movq        xmm7,   [rsi + 13]
    punpcklqdq  xmm0,   xmm4                ;bytes -3..12 (pixels 0..7)
    punpcklqdq  xmm4,   xmm7                ;bytes 5..20 (pixels 8..15)

    movdqa      xmm1,   xmm0
    movdqa      xmm2,   xmm0
    movdqa      xmm3,   xmm0
    movdqa      xmm5,   xmm4
    movdqa      xmm6,   xmm4
    movdqa      xmm7,   xmm4

    pshufb      xmm0,   [GLOBAL(shuf_t0t1)]
    pshufb      xmm1,   [GLOBAL(shuf_t2t3)]
    pshufb      xmm2,   [GLOBAL(shuf_t4t5)]
    pshufb      xmm3,   [GLOBAL(shuf_t6t7)]
    pshufb      xmm4,   [GLOBAL(shuf_t0t1)]
    pshufb      xmm5,   [GLOBAL(shuf_t2t3)]
    pshufb      xmm6,   [GLOBAL(shuf_t4t5)]
    pshufb      xmm7,   [GLOBAL(shuf_t6t7)]

    pmaddubsw   xmm0,   k0k1
    pmaddubsw   xmm1,   k2k3
    pmaddubsw   xmm2,   k4k5
    pmaddubsw   xmm3,   k6k7
    pmaddubsw   xmm4,   k0k1
    pmaddubsw   xmm5,   k2k3
    pmaddubsw   xmm6,   k4k5
    pmaddubsw   xmm7,   k6k7

    ; pixels 0..7
    paddsw      xmm0,   xmm3                ;outer taps first
    movdqa      xmm3,   xmm1
    pmaxsw      xmm1,   xmm2                ;max(k2k3, k4k5)
    pminsw      xmm2,   xmm3                ;min(k2k3, k4k5)
    paddsw      xmm0,   xmm2
    paddsw      xmm0,   xmm1

    ; pixels 8..15
    paddsw      xmm4,   xmm7                ;outer taps first
    movdqa      xmm7,   xmm5
    pmaxsw      xmm5,   xmm6                ;max(k2k3, k4k5)
    pminsw      xmm6,   xmm7                ;min(k2k3, k4k5)
    paddsw      xmm4,   xmm6
    paddsw      xmm4,   xmm5

    paddsw      xmm0,   krd                 ;round
    paddsw      xmm4,   krd
    psraw       xmm0,   7
    psraw       xmm4,   7
    packuswb    xmm0,   xmm0                ;clamp to 0..255
    packuswb    xmm4,   xmm4
    punpcklqdq  xmm0,   xmm4                ;join the two halves
%if %1
    movdqa      xmm1,   [rdi]
    pavgb       xmm0,   xmm1
%endif

    lea         rsi,    [rsi + rax]
    movdqa      [rdi],  xmm0

    lea         rdi,    [rdi + rdx]
    dec         rcx
    jnz         .loop
%endm

;void vp9_filter_block1d4_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
global sym(vp9_filter_block1d4_h8_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 3                 ;three aligned 16-byte slots
    %define k0k1k4k5 [rsp + 16 * 0]
    %define k2k3k6k7 [rsp + 16 * 1]
    %define krd      [rsp + 16 * 2]

    HORIZx4 0

    add         rsp, 16 * 3
    pop         rsp                         ;undo ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_filter_block1d8_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
global sym(vp9_filter_block1d8_h8_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5                   ;five aligned 16-byte slots
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    HORIZx8 0

    add         rsp, 16*5
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_filter_block1d16_h8_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    short *filter
;)
global sym(vp9_filter_block1d16_h8_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5                   ;five aligned 16-byte slots
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    HORIZx16 0

    add         rsp, 16*5
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; Averaging variants of the horizontal filters. Same arguments as the
; non-avg functions above; the filtered pixels are combined with the
; existing destination bytes using pavgb before being stored.

global sym(vp9_filter_block1d4_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d4_h8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * 3                 ;three aligned 16-byte slots
    %define k0k1k4k5 [rsp + 16 * 0]
    %define k2k3k6k7 [rsp + 16 * 1]
    %define krd      [rsp + 16 * 2]

    HORIZx4 1

    add         rsp, 16 * 3
    pop         rsp                         ;undo ALIGN_STACK
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d8_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d8_h8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5                   ;five aligned 16-byte slots
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    HORIZx8 1

    add         rsp, 16*5
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

global sym(vp9_filter_block1d16_h8_avg_ssse3) PRIVATE
sym(vp9_filter_block1d16_h8_avg_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16*5                   ;five aligned 16-byte slots
    %define k0k1 [rsp + 16*0]
    %define k2k3 [rsp + 16*1]
    %define k4k5 [rsp + 16*2]
    %define k6k7 [rsp + 16*3]
    %define krd [rsp + 16*4]

    HORIZx16 1

    add         rsp, 16*5
    pop         rsp                         ;undo ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; pshufb masks used by the horizontal filters. Each mask gathers eight
; overlapping byte pairs (i+t, i+t+1), i = 0..7, from a 16-byte source
; window so that pmaddubsw can apply one tap pair to eight pixels.
SECTION_RODATA
align 16
shuf_t0t1:
    db  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
align 16
shuf_t2t3:
    db  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
align 16
shuf_t4t5:
    db  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
align 16
shuf_t6t7:
    db  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14