1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libvpx/vp8/common/x86/subpixel_ssse3.asm Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1508 @@ 1.4 +; 1.5 +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 1.6 +; 1.7 +; Use of this source code is governed by a BSD-style license 1.8 +; that can be found in the LICENSE file in the root of the source 1.9 +; tree. An additional intellectual property rights grant can be found 1.10 +; in the file PATENTS. All contributing project authors may 1.11 +; be found in the AUTHORS file in the root of the source tree. 1.12 +; 1.13 + 1.14 + 1.15 +%include "vpx_ports/x86_abi_support.asm" 1.16 + 1.17 +%define BLOCK_HEIGHT_WIDTH 4 1.18 +%define VP8_FILTER_WEIGHT 128 1.19 +%define VP8_FILTER_SHIFT 7 1.20 + 1.21 + 1.22 +;/************************************************************************************ 1.23 +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The 1.24 +; input pixel array has output_height rows. This routine assumes that output_height is an 1.25 +; even number. This function handles 8 pixels in horizontal direction, calculating ONE 1.26 +; row each iteration to take advantage of 128-bit operations. 
;
; This is an implementation of some of the SSE optimizations first seen in ffvp8
;
;*************************************************************************************/
;void vp8_filter_block1d8_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
;
; Horizontal sub-pixel filter, 8 output pixels per row, one row per loop
; iteration.
;
; Per row the code reads 8 bytes at src-2 and 8 bytes at src+3 (i.e. bytes
; src[-2 .. 10]) and writes 8 bytes at the current output row.
;
; Coefficients: vp8_filter_index*16 is added to the address of k0_k5; the
; entries at +128 and +256 from there are loaded as k1_k3 and k2_k4
; (NOTE(review): this relies on the three tables being laid out back to back,
; 128 bytes each -- the tables are not visible in this part of the file).
; If the first dword of the k0_k5 entry is zero the routine takes the 4-tap
; path, which skips the k0/k5 multiply entirely.
;
; Each output is (sum of pmaddubsw products + rd) >> 7, accumulated with
; signed-saturating adds and packed to bytes with unsigned saturation.
;
; Register roles inside the loops:
;   rsi = source row pointer        rax = src_pixels_per_line
;   rdi = output row pointer        rdx = output_pitch
;   rcx = rows remaining            xmm7 = rounding constant (rd)
;   6-tap: xmm4/xmm5/xmm6 = k0_k5 / k2_k4 / k1_k3
;   4-tap: xmm5/xmm6 = k2_k4 / k1_k3, xmm3/xmm4 = shuf2bfrom1 / shuf3bfrom1
;
; output_height is not checked: the loop is dec/jnz, so a height of 0 would
; wrap the counter instead of returning immediately.
;
; arg(n), SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT and GLOBAL come from
; vpx_ports/x86_abi_support.asm.
global sym(vp8_filter_block1d8_h6_ssse3) PRIVATE
sym(vp8_filter_block1d8_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)       ;table index
    xor         rsi, rsi                    ; esi = 0, compared against the taps below
    shl         rdx, 4                      ; 16 bytes per table entry

    movdqa      xmm7, [GLOBAL(rd)]

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx
    mov         rdi, arg(2)                 ;output_ptr

    cmp         esi, DWORD PTR [rax]        ; first dword of the k0_k5 entry == 0 ?
    je          .vp8_filter_block1d8_h4_ssse3 ; yes: outer taps unused, use 4-tap path

    movdqa      xmm4, XMMWORD PTR [rax]     ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256] ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128] ;k1_k3

    mov         rsi, arg(0)                 ;src_ptr
    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)       ;output_height

    movsxd      rdx, dword ptr arg(3)       ;output_pitch

    sub         rdi, rdx                    ; pre-bias: the loop advances rdi before storing
;xmm3 free
.filter_block1d8_h6_rowloop_ssse3:
    movq        xmm0, MMWORD PTR [rsi - 2]  ; -2 -1  0  1  2  3  4  5

    movq        xmm2, MMWORD PTR [rsi + 3]  ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0, xmm2                  ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm1, xmm0
    pmaddubsw   xmm0, xmm4                  ; k0/k5 products

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]

    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]
    pmaddubsw   xmm1, xmm5                  ; k2/k4 products

    lea         rdi, [rdi + rdx]            ; next output row (lea leaves flags alone)
    pmaddubsw   xmm2, xmm6                  ; k1/k3 products

    lea         rsi, [rsi + rax]            ; next source row
    dec         rcx                         ; sets ZF for the jnz below; nothing in
                                            ; between modifies flags

    paddsw      xmm0, xmm1
    paddsw      xmm2, xmm7                  ; + rounding

    paddsw      xmm0, xmm2

    psraw       xmm0, 7

    packuswb    xmm0, xmm0

    movq        MMWORD Ptr [rdi], xmm0
    jnz         .filter_block1d8_h6_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path. On entry: rax = address of the k0_k5 entry, rdi = output_ptr,
; xmm7 = rd. Local label, like the 4-tap paths of the other filter routines
; in this file, so all labels of this function share one scope.
.vp8_filter_block1d8_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256] ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128] ;k1_k3

    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf2bfrom1)]
    movdqa      xmm4, XMMWORD PTR [GLOBAL(shuf3bfrom1)]

    mov         rsi, arg(0)                 ;src_ptr

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)       ;output_height

    movsxd      rdx, dword ptr arg(3)       ;output_pitch

    sub         rdi, rdx                    ; pre-bias, see the 6-tap loop

.filter_block1d8_h4_rowloop_ssse3:
    movq        xmm0, MMWORD PTR [rsi - 2]  ; -2 -1  0  1  2  3  4  5

    movq        xmm1, MMWORD PTR [rsi + 3]  ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0, xmm1                  ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm2, xmm0
    pshufb      xmm0, xmm3

    pshufb      xmm2, xmm4
    pmaddubsw   xmm0, xmm5                  ; k2/k4 products

    lea         rdi, [rdi + rdx]
    pmaddubsw   xmm2, xmm6                  ; k1/k3 products

    lea         rsi, [rsi + rax]
    dec         rcx                         ; ZF consumed by the jnz after the store

    paddsw      xmm0, xmm7                  ; + rounding

    paddsw      xmm0, xmm2

    psraw       xmm0, 7

    packuswb    xmm0, xmm0

    movq        MMWORD Ptr [rdi], xmm0

    jnz         .filter_block1d8_h4_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_filter_block1d16_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned
;                    int output_height,
;    unsigned int    vp8_filter_index
;)
;
; Horizontal 6-tap filter, 16 output pixels per row, one row per iteration.
; The row is computed as two independent 8-pixel halves whose instructions
; are interleaved; the halves are joined with punpcklqdq and written with a
; single movdqa, so every output row address must be 16-byte aligned.
;
; Per row the code reads bytes src[-2 .. 18] (movq loads at -2, +3, +6, +11).
; This routine always runs all six taps (there is no 4-tap branch).
;
; Register roles inside the loop:
;   rsi = source row pointer        rax = src_pixels_per_line
;   rdi = output row pointer        rdx = output_pitch
;   rcx = rows remaining (dec/jnz, a height of 0 is not guarded)
;   xmm4/xmm5/xmm6 = k0_k5 / k2_k4 / k1_k3, xmm7 = scratch
global sym(vp8_filter_block1d16_h6_ssse3) PRIVATE
sym(vp8_filter_block1d16_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)       ;table index
    shl         rdx, 4                      ; 16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    mov         rdi, arg(2)                 ;output_ptr

    mov         rsi, arg(0)                 ;src_ptr

    movdqa      xmm4, XMMWORD PTR [rax]     ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256] ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128] ;k1_k3

    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)       ;output_height
    movsxd      rdx, dword ptr arg(3)       ;output_pitch

.filter_block1d16_h6_rowloop_ssse3:
    ; ---- left half (output pixels 0..7) ----
    movq        xmm0, MMWORD PTR [rsi - 2]  ; -2 -1  0  1  2  3  4  5

    movq        xmm3, MMWORD PTR [rsi + 3]  ;  3  4  5  6  7  8  9 10

    punpcklbw   xmm0, xmm3                  ; -2  3 -1  4  0  5  1  6  2  7  3  8  4  9  5 10

    movdqa      xmm1, xmm0
    pmaddubsw   xmm0, xmm4                  ; k0/k5 products

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]

    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]
    movq        xmm3, MMWORD PTR [rsi + 6]  ; right half: same pattern, 8 bytes further on

    pmaddubsw   xmm1, xmm5                  ; k2/k4 products
    movq        xmm7, MMWORD PTR [rsi + 11]

    pmaddubsw   xmm2, xmm6                  ; k1/k3 products
    punpcklbw   xmm3, xmm7

    paddsw      xmm0, xmm1
    movdqa      xmm1, xmm3

    pmaddubsw   xmm3, xmm4                  ; right half k0/k5 products
    paddsw      xmm0, xmm2

    movdqa      xmm2, xmm1
    paddsw      xmm0, [GLOBAL(rd)]          ; + rounding

    pshufb      xmm1, [GLOBAL(shuf2bfrom1)]
    pshufb      xmm2, [GLOBAL(shuf3bfrom1)]

    psraw       xmm0, 7
    pmaddubsw   xmm1, xmm5

    pmaddubsw   xmm2, xmm6
    packuswb    xmm0, xmm0

    lea         rsi, [rsi + rax]            ; next source row
    paddsw      xmm3, xmm1

    paddsw      xmm3, xmm2

    paddsw      xmm3, [GLOBAL(rd)]          ; + rounding

    psraw       xmm3, 7

    packuswb    xmm3, xmm3

    punpcklqdq  xmm0, xmm3                  ; low qword = pixels 0..7, high = 8..15

    movdqa      XMMWORD Ptr [rdi], xmm0     ; aligned 16-byte store

    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .filter_block1d16_h6_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1d4_h6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pixels_per_line,
;    unsigned char  *output_ptr,
;    unsigned int    output_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
;
; Horizontal filter, 4 output pixels per row, one row per iteration.
;
; Each row is fetched with one unaligned 16-byte load at src-2, so bytes
; src[-2 .. 13] must be readable even though only 4 pixels are produced.
; The byte pairs for each coefficient pair are built with pshufb using the
; shuf1b / shuf2b / shuf3b tables (defined elsewhere in this file).
;
; If the first dword of the selected k0_k5 entry is zero, the 4-tap path is
; taken and the k0/k5 multiply is skipped.
;
; Register roles inside the loops:
;   rsi = source row pointer        rax = src_pixels_per_line
;   rdi = output row pointer        rdx = output_pitch
;   rcx = rows remaining (dec/jnz, a height of 0 is not guarded)
;   xmm7 = rounding constant (rd)
;   6-tap: xmm4/xmm5/xmm6 = k0_k5 / k2_k4 / k1_k3
;   4-tap: xmm5/xmm6 = k2_k4 / k1_k3, xmm0/xmm3 = shuf2b / shuf3b
global sym(vp8_filter_block1d4_h6_ssse3) PRIVATE
sym(vp8_filter_block1d4_h6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)       ;table index
    xor         rsi, rsi                    ; esi = 0, compared against the taps below
    shl         rdx, 4                      ; 16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx
    movdqa      xmm7, [GLOBAL(rd)]

    cmp         esi, DWORD PTR [rax]        ; first dword of the k0_k5 entry == 0 ?
    je          .vp8_filter_block1d4_h4_ssse3

    movdqa      xmm4, XMMWORD PTR [rax]     ;k0_k5
    movdqa      xmm5, XMMWORD PTR [rax+256] ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128] ;k1_k3

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)       ;output_height

    movsxd      rdx, dword ptr arg(3)       ;output_pitch

;xmm3 free
.filter_block1d4_h6_rowloop_ssse3:
    movdqu      xmm0, XMMWORD PTR [rsi - 2] ; 16 source bytes starting at src-2

    movdqa      xmm1, xmm0
    pshufb      xmm0, [GLOBAL(shuf1b)]

    movdqa      xmm2, xmm1
    pshufb      xmm1, [GLOBAL(shuf2b)]
    pmaddubsw   xmm0, xmm4                  ; k0/k5 products
    pshufb      xmm2, [GLOBAL(shuf3b)]
    pmaddubsw   xmm1, xmm5                  ; k2/k4 products

;--
    pmaddubsw   xmm2, xmm6                  ; k1/k3 products

    lea         rsi, [rsi + rax]            ; next source row
;--
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm7                  ; + rounding
    paddsw      xmm0, xmm2
    psraw       xmm0, 7
    packuswb    xmm0, xmm0

    movd        DWORD PTR [rdi], xmm0       ; 4 output pixels

    add         rdi, rdx
    dec         rcx
    jnz         .filter_block1d4_h6_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path. On entry: rax = address of the k0_k5 entry, xmm7 = rd.
.vp8_filter_block1d4_h4_ssse3:
    movdqa      xmm5, XMMWORD PTR [rax+256] ;k2_k4
    movdqa      xmm6, XMMWORD PTR [rax+128] ;k1_k3
    movdqa      xmm0, XMMWORD PTR [GLOBAL(shuf2b)]
    movdqa      xmm3, XMMWORD PTR [GLOBAL(shuf3b)]

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    movsxd      rax, dword ptr arg(1)       ;src_pixels_per_line
    movsxd      rcx, dword ptr arg(4)       ;output_height

    movsxd      rdx, dword ptr arg(3)       ;output_pitch

.filter_block1d4_h4_rowloop_ssse3:
    movdqu      xmm1, XMMWORD PTR [rsi - 2] ; 16 source bytes starting at src-2

    movdqa      xmm2, xmm1
    pshufb      xmm1, xmm0                  ;;[GLOBAL(shuf2b)]
    pshufb      xmm2, xmm3                  ;;[GLOBAL(shuf3b)]
    pmaddubsw   xmm1, xmm5                  ; k2/k4 products

;--
    pmaddubsw   xmm2, xmm6                  ; k1/k3 products

    lea         rsi, [rsi + rax]            ; next source row
;--
    paddsw      xmm1, xmm7                  ; + rounding
    paddsw      xmm1, xmm2
    psraw       xmm1, 7
    packuswb    xmm1, xmm1

    movd        DWORD PTR [rdi], xmm1       ; 4 output pixels

    add         rdi, rdx
    dec         rcx
    jnz         .filter_block1d4_h4_rowloop_ssse3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret



;void vp8_filter_block1d16_v6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pitch,
;    unsigned char
;                   *output_ptr,
;    unsigned int    out_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
;
; Vertical filter, 16 pixels wide, one output row per iteration, computed as
; two 8-byte halves.
;
; For each output row the 6-tap path reads rows A..F at
; src_ptr + {0,1,2,3,4,5} * src_pitch, so src_ptr addresses the first tap row
; (NOTE(review): callers presumably pass a pointer two rows above the output
; row -- confirm at the call sites). The 4-tap path reads rows B..E only.
; Row pairs are interleaved byte-wise (B|D, C|E, A|F) and multiplied with
; k1_k3, k2_k4 and k0_k5 respectively.
;
; The 6-tap path stores each half with movq; the 4-tap path joins the halves
; and stores with movdqa, which requires 16-byte aligned output rows.
;
; Register roles inside the loops:
;   rsi = row A pointer             rax = rsi + src_pitch (row B)
;   rdx = src_pitch                 rdi = output row pointer
;   rcx = rows remaining (dec/jnz, a height of 0 is not guarded)
;   r8  = out_pitch on 64-bit builds; 32-bit builds re-read arg(3) instead
;   6-tap: xmm5/xmm6/xmm7 = k0_k5 / k2_k4 / k1_k3
;   4-tap: xmm6/xmm7 = k2_k4 / k1_k3
global sym(vp8_filter_block1d16_v6_ssse3) PRIVATE
sym(vp8_filter_block1d16_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)       ;table index
    xor         rsi, rsi                    ; esi = 0, compared against the taps below
    shl         rdx, 4                      ; 16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    cmp         esi, DWORD PTR [rax]        ; first dword of the k0_k5 entry == 0 ?
    je          .vp8_filter_block1d16_v4_ssse3

    movdqa      xmm5, XMMWORD PTR [rax]     ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256] ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128] ;k1_k3

    mov         rsi, arg(0)                 ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
    mov         rdi, arg(2)                 ;output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx                    ; rax = row B, so [rax + rdx*2] = D, [rax + rdx*4] = F


.vp8_filter_block1d16_v6_ssse3_loop:
    ; ---- left 8 pixels ----
    movq        xmm1, MMWORD PTR [rsi]               ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]         ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]     ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]     ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]     ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]     ;F

    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5

    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]          ; + rounding
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2      ;store the results

    ; ---- right 8 pixels ----
    movq        xmm1, MMWORD PTR [rsi + 8]           ;A
    movq        xmm2, MMWORD PTR [rsi + rdx + 8]     ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2 + 8] ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4 + 8] ;F
    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5

    add         rsi, rdx                    ; slide the tap window down one row
    add         rax, rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, [GLOBAL(rd)]          ; + rounding
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi+8], xmm2

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d16_v6_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path. On entry: rax = address of the k0_k5 entry.
.vp8_filter_block1d16_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+256] ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128] ;k1_k3

    mov         rsi, arg(0)                 ;src_ptr
    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
    mov         rdi, arg(2)                 ;output_ptr

%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ;out_pitch
%endif
    mov         rax, rsi
    movsxd      rcx, DWORD PTR arg(4)       ;output_height
    add         rax, rdx                    ; rax = row B

.vp8_filter_block1d16_v4_ssse3_loop:
    movq        xmm2, MMWORD PTR [rsi + rdx]         ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]     ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]     ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]     ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    pmaddubsw   xmm3, xmm6
    pmaddubsw   xmm2, xmm7
    movq        xmm5, MMWORD PTR [rsi + rdx + 8]     ;B
    movq        xmm1, MMWORD PTR [rsi + rdx * 2 + 8] ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2 + 8] ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4 + 8] ;E

    paddsw      xmm2, [GLOBAL(rd)]          ; + rounding
    paddsw      xmm2, xmm3
    psraw       xmm2, 7
    packuswb    xmm2, xmm2                  ; left 8 pixels

    punpcklbw   xmm5, xmm4                  ;B D
    punpcklbw   xmm1, xmm0                  ;C E

    pmaddubsw   xmm1, xmm6
    pmaddubsw   xmm5, xmm7

    movdqa      xmm4, [GLOBAL(rd)]
    add         rsi, rdx                    ; slide the tap window down one row
    add         rax, rdx
;--
;--
    paddsw      xmm5, xmm1
    paddsw      xmm5, xmm4                  ; + rounding
    psraw       xmm5, 7
    packuswb    xmm5, xmm5                  ; right 8 pixels

    punpcklqdq  xmm2, xmm5                  ; low qword = left half, high = right half

    movdqa      XMMWORD PTR [rdi], xmm2     ; aligned 16-byte store

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;out_pitch
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d16_v4_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_filter_block1d8_v6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pitch,
;    unsigned char  *output_ptr,
;    unsigned int    out_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
;
; Vertical filter, 8 pixels wide, one output row per iteration.
; Same row addressing as the 16-wide routine: rows A..F are read at
; src_ptr + {0..5} * src_pitch (6-tap) or rows B..E only (4-tap), and each
; row contributes 8 bytes. Results are stored with movq (no alignment
; requirement on the output).
;
; Register roles inside the loops:
;   rsi = row A pointer             rax = rsi + src_pitch (row B)
;   rdx = src_pitch                 rdi = output row pointer
;   rcx = rows remaining (dec/jnz, a height of 0 is not guarded)
;   r8  = out_pitch on 64-bit builds; 32-bit builds re-read arg(3) instead
;   6-tap: xmm5/xmm6/xmm7 = k0_k5 / k2_k4 / k1_k3, xmm4 = rd (reloaded per row)
;   4-tap: xmm6/xmm7 = k2_k4 / k1_k3, xmm5 = rd (loaded once)
global sym(vp8_filter_block1d8_v6_ssse3) PRIVATE
sym(vp8_filter_block1d8_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)       ;table index
    xor         rsi, rsi                    ; esi = 0, compared against the taps below
    shl         rdx, 4                      ; 16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
    mov         rdi, arg(2)                 ;output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)       ;[output_height]

    cmp         esi, DWORD PTR [rax]        ; first dword of the k0_k5 entry == 0 ?
    je          .vp8_filter_block1d8_v4_ssse3

    movdqa      xmm5, XMMWORD PTR [rax]     ;k0_k5
    movdqa      xmm6, XMMWORD PTR [rax+256] ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128] ;k1_k3

    mov         rsi, arg(0)                 ;src_ptr

    mov         rax, rsi
    add         rax, rdx                    ; rax = row B

.vp8_filter_block1d8_v6_ssse3_loop:
    movq        xmm1, MMWORD PTR [rsi]               ;A
    movq        xmm2, MMWORD PTR [rsi + rdx]         ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]     ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]     ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]     ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    movq        xmm0, MMWORD PTR [rax + rdx * 4]     ;F
    movdqa      xmm4, [GLOBAL(rd)]

    pmaddubsw   xmm3, xmm6
    punpcklbw   xmm1, xmm0                  ;A F
    pmaddubsw   xmm2, xmm7
    pmaddubsw   xmm1, xmm5
    add         rsi, rdx                    ; slide the tap window down one row
    add         rax, rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm1
    paddsw      xmm2, xmm4                  ; + rounding
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;[out_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d8_v6_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path. On entry: rax = address of the k0_k5 entry, rdx = src_pitch,
; rdi = output_ptr, rcx = output_height (r8 = out_pitch on 64-bit builds).
.vp8_filter_block1d8_v4_ssse3:
    movdqa      xmm6, XMMWORD PTR [rax+256] ;k2_k4
    movdqa      xmm7, XMMWORD PTR [rax+128] ;k1_k3
    movdqa      xmm5, [GLOBAL(rd)]

    mov         rsi, arg(0)                 ;src_ptr

    mov         rax, rsi
    add         rax, rdx                    ; rax = row B

.vp8_filter_block1d8_v4_ssse3_loop:
    movq        xmm2, MMWORD PTR [rsi + rdx]         ;B
    movq        xmm3, MMWORD PTR [rsi + rdx * 2]     ;C
    movq        xmm4, MMWORD PTR [rax + rdx * 2]     ;D
    movq        xmm0, MMWORD PTR [rsi + rdx * 4]     ;E

    punpcklbw   xmm2, xmm4                  ;B D
    punpcklbw   xmm3, xmm0                  ;C E

    pmaddubsw   xmm3, xmm6
    pmaddubsw   xmm2, xmm7
    add         rsi, rdx                    ; slide the tap window down one row
    add         rax, rdx
;--
;--
    paddsw      xmm2, xmm3
    paddsw      xmm2, xmm5                  ; + rounding
    psraw       xmm2, 7
    packuswb    xmm2, xmm2

    movq        MMWORD PTR [rdi], xmm2

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;[out_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d8_v4_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
;void vp8_filter_block1d4_v6_ssse3
;(
;    unsigned char  *src_ptr,
;    unsigned int    src_pitch,
;    unsigned char  *output_ptr,
;    unsigned int    out_pitch,
;    unsigned int    output_height,
;    unsigned int    vp8_filter_index
;)
;
; Vertical filter, 4 pixels wide, one output row per iteration.
; Same row addressing as the wider vertical routines, but implemented on the
; 64-bit MMX registers (mm0-mm7): rows are loaded with movd, only the low
; 8 bytes of each 16-byte coefficient/rounding entry are used, and no XMM
; register is touched (hence no SAVE_XMM / RESTORE_XMM).
;
; NOTE(review): this routine never executes emms, so MMX state is left
; active on return -- presumably the caller is expected to clear it before
; using x87 code; confirm against the callers.
;
; Register roles inside the loops:
;   rsi = row A pointer             rax = rsi + src_pitch (row B)
;   rdx = src_pitch                 rdi = output row pointer
;   rcx = rows remaining (dec/jnz, a height of 0 is not guarded)
;   r8  = out_pitch on 64-bit builds; 32-bit builds re-read arg(3) instead
;   6-tap: mm5/mm6/mm7 = k0_k5 / k2_k4 / k1_k3, mm4 = rd (reloaded per row)
;   4-tap: mm6/mm7 = k2_k4 / k1_k3, mm5 = rd (loaded once)
global sym(vp8_filter_block1d4_v6_ssse3) PRIVATE
sym(vp8_filter_block1d4_v6_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    movsxd      rdx, DWORD PTR arg(5)       ;table index
    xor         rsi, rsi                    ; esi = 0, compared against the taps below
    shl         rdx, 4                      ; 16 bytes per table entry

    lea         rax, [GLOBAL(k0_k5)]
    add         rax, rdx

    movsxd      rdx, DWORD PTR arg(1)       ;pixels_per_line
    mov         rdi, arg(2)                 ;output_ptr
%if ABI_IS_32BIT=0
    movsxd      r8, DWORD PTR arg(3)        ; out_pitch
%endif
    movsxd      rcx, DWORD PTR arg(4)       ;[output_height]

    cmp         esi, DWORD PTR [rax]        ; first dword of the k0_k5 entry == 0 ?
    je          .vp8_filter_block1d4_v4_ssse3

    movq        mm5, MMWORD PTR [rax]       ;k0_k5
    movq        mm6, MMWORD PTR [rax+256]   ;k2_k4
    movq        mm7, MMWORD PTR [rax+128]   ;k1_k3

    mov         rsi, arg(0)                 ;src_ptr

    mov         rax, rsi
    add         rax, rdx                    ; rax = row B

.vp8_filter_block1d4_v6_ssse3_loop:
    movd        mm1, DWORD PTR [rsi]                 ;A
    movd        mm2, DWORD PTR [rsi + rdx]           ;B
    movd        mm3, DWORD PTR [rsi + rdx * 2]       ;C
    movd        mm4, DWORD PTR [rax + rdx * 2]       ;D
    movd        mm0, DWORD PTR [rsi + rdx * 4]       ;E

    punpcklbw   mm2, mm4                    ;B D
    punpcklbw   mm3, mm0                    ;C E

    movd        mm0, DWORD PTR [rax + rdx * 4]       ;F

    movq        mm4, [GLOBAL(rd)]

    pmaddubsw   mm3, mm6
    punpcklbw   mm1, mm0                    ;A F
    pmaddubsw   mm2, mm7
    pmaddubsw   mm1, mm5
    add         rsi, rdx                    ; slide the tap window down one row
    add         rax, rdx
;--
;--
    paddsw      mm2, mm3
    paddsw      mm2, mm1
    paddsw      mm2, mm4                    ; + rounding
    psraw       mm2, 7
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;[out_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d4_v6_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

; 4-tap path. On entry: rax = address of the k0_k5 entry, rdx = src_pitch,
; rdi = output_ptr, rcx = output_height (r8 = out_pitch on 64-bit builds).
.vp8_filter_block1d4_v4_ssse3:
    movq        mm6, MMWORD PTR [rax+256]   ;k2_k4
    movq        mm7, MMWORD PTR [rax+128]   ;k1_k3
    movq        mm5, MMWORD PTR [GLOBAL(rd)]

    mov         rsi, arg(0)                 ;src_ptr

    mov         rax, rsi
    add         rax, rdx                    ; rax = row B

.vp8_filter_block1d4_v4_ssse3_loop:
    movd        mm2, DWORD PTR [rsi + rdx]           ;B
    movd        mm3, DWORD PTR [rsi + rdx * 2]       ;C
    movd        mm4, DWORD PTR [rax + rdx * 2]       ;D
    movd        mm0, DWORD PTR [rsi + rdx * 4]       ;E

    punpcklbw   mm2, mm4                    ;B D
    punpcklbw   mm3, mm0                    ;C E

    pmaddubsw   mm3, mm6
    pmaddubsw   mm2, mm7
    add         rsi, rdx                    ; slide the tap window down one row
    add         rax, rdx
;--
;--
    paddsw      mm2, mm3
    paddsw      mm2, mm5                    ; + rounding
    psraw       mm2, 7
    packuswb    mm2, mm2

    movd        DWORD PTR [rdi], mm2

%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(3)       ;[out_pitch]
%else
    add         rdi, r8
%endif
    dec         rcx
    jnz         .vp8_filter_block1d4_v4_ssse3_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_bilinear_predict16x16_ssse3
;(
;    unsigned char  *src_ptr,
;    int             src_pixels_per_line,
;    int             xoffset,
;    int             yoffset,
;    unsigned char  *dst_ptr,
;    int             dst_pitch
;)
;
; 16x16 bilinear prediction. xoffset and yoffset each select a 16-byte entry
; (offset * 16) of vp8_bilinear_filters_ssse3. Three code paths:
;
;   xoffset != 0, yoffset != 0 : horizontal pass on each source row, then a
;                                vertical pass between consecutive filtered
;                                rows (17 source rows are read).
;   xoffset == 0               : .b16x16_sp_only, vertical pass only, two
;                                output rows per iteration (17 rows read).
;   xoffset != 0, yoffset == 0 : .b16x16_fp_only, horizontal pass only, two
;                                output rows per iteration.
;
; The horizontal pass reads bytes src[0 .. 16] of a row. Every pass computes
; (pmaddubsw result + rd) >> VP8_FILTER_SHIFT and packs with unsigned
; saturation.
;
; All loops terminate when rdi reaches rcx = dst_ptr + 16 * dst_pitch.
; Destination rows are written with movdqa and therefore must be 16-byte
; aligned.
global sym(vp8_bilinear_predict16x16_ssse3) PRIVATE
sym(vp8_bilinear_predict16x16_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    lea         rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
    movsxd      rax, dword ptr arg(2)       ; xoffset

    cmp         rax, 0                      ; skip first_pass filter if xoffset=0
    je          .b16x16_sp_only

    shl         rax, 4
    lea         rax, [rax + rcx]            ; HFilter

    mov         rdi, arg(4)                 ; dst_ptr
    mov         rsi, arg(0)                 ; src_ptr
    movsxd      rdx, dword ptr arg(5)       ; dst_pitch

    movdqa      xmm1, [rax]                 ; xmm1 = horizontal filter taps

    movsxd      rax, dword ptr arg(3)       ; yoffset

    cmp         rax, 0                      ; skip second_pass filter if yoffset=0
    je          .b16x16_fp_only

    shl         rax, 4
    lea         rax, [rax + rcx]            ; VFilter

    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]            ; rcx = dst_ptr + 16 * dst_pitch (end marker)
    movsxd      rdx, dword ptr arg(1)       ; src_pixels_per_line

    movdqa      xmm2, [rax]                 ; xmm2 = vertical filter taps

%if ABI_IS_32BIT=0
    movsxd      r8, dword ptr arg(5)        ; dst_pitch
%endif
    ; Horizontal pass on source row 0; the result is kept in xmm7 as the
    ; "previous row" for the vertical pass.
    movq        xmm3, [rsi]                 ; 00 01 02 03 04 05 06 07
    movq        xmm5, [rsi+1]               ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm3, xmm5                  ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08
    movq        xmm4, [rsi+8]               ; 08 09 10 11 12 13 14 15

    movq        xmm5, [rsi+9]               ; 09 10 11 12 13 14 15 16

    lea         rsi, [rsi + rdx]            ; next line

    pmaddubsw   xmm3, xmm1                  ; 8 words: filtered pixels 00..07

    punpcklbw   xmm4, xmm5                  ; 08 09 09 10 10 11 11 12 12 13 13 14 14 15 15 16
    pmaddubsw   xmm4, xmm1                  ; 8 words: filtered pixels 08..15

    paddw       xmm3, [GLOBAL(rd)]          ; xmm3 += round value
    psraw       xmm3, VP8_FILTER_SHIFT      ; xmm3 /= 128

    paddw       xmm4, [GLOBAL(rd)]          ; xmm4 += round value
    psraw       xmm4, VP8_FILTER_SHIFT      ; xmm4 /= 128

    movdqa      xmm7, xmm3
    packuswb    xmm7, xmm4                  ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15

.next_row:
    ; Horizontal pass on the next source row -> xmm6 (16 bytes).
    movq        xmm6, [rsi]                 ; 00 01 02 03 04 05 06 07
    movq        xmm5, [rsi+1]               ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm6, xmm5
    movq        xmm4, [rsi+8]               ; 08 09 10 11 12 13 14 15

    movq        xmm5, [rsi+9]               ; 09 10 11 12 13 14 15 16
    lea         rsi, [rsi + rdx]            ; next line

    pmaddubsw   xmm6, xmm1

    punpcklbw   xmm4, xmm5
    pmaddubsw   xmm4, xmm1

    paddw       xmm6, [GLOBAL(rd)]          ; xmm6 += round value
    psraw       xmm6, VP8_FILTER_SHIFT      ; xmm6 /= 128

    paddw       xmm4, [GLOBAL(rd)]          ; xmm4 += round value
    psraw       xmm4, VP8_FILTER_SHIFT      ; xmm4 /= 128

    packuswb    xmm6, xmm4
    movdqa      xmm5, xmm7

    ; Vertical pass between the previous filtered row (xmm7) and this one
    ; (xmm6): interleave the two rows byte-wise and apply the vertical taps.
    punpcklbw   xmm5, xmm6
    pmaddubsw   xmm5, xmm2

    punpckhbw   xmm7, xmm6
    pmaddubsw   xmm7, xmm2

    paddw       xmm5, [GLOBAL(rd)]          ; xmm5 += round value
    psraw       xmm5, VP8_FILTER_SHIFT      ; xmm5 /= 128

    paddw       xmm7, [GLOBAL(rd)]          ; xmm7 += round value
    psraw       xmm7, VP8_FILTER_SHIFT      ; xmm7 /= 128

    packuswb    xmm5, xmm7
    movdqa      xmm7, xmm6                  ; this row becomes the previous row

    movdqa      [rdi], xmm5                 ; store the results in the destination
%if ABI_IS_32BIT
    add         rdi, DWORD PTR arg(5)       ; dst_pitch
%else
    add         rdi, r8
%endif

    cmp         rdi, rcx
    jne         .next_row

    jmp         .done

; Vertical pass only (xoffset == 0). On entry rcx = filter table base.
.b16x16_sp_only:
    movsxd      rax, dword ptr arg(3)       ; yoffset
    shl         rax, 4
    lea         rax, [rax + rcx]            ; VFilter

    mov         rdi, arg(4)                 ; dst_ptr
    mov         rsi, arg(0)                 ; src_ptr
    movsxd      rdx, dword ptr arg(5)       ; dst_pitch

    movdqa      xmm1, [rax]                 ; VFilter

    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]            ; rcx = dst_ptr + 16 * dst_pitch (end marker)
    movsxd      rax, dword ptr arg(1)       ; src_pixels_per_line

    ; get the first horizontal line done
    movq        xmm4, [rsi]                 ; load row 0
    movq        xmm2, [rsi + 8]             ; load row 0

    lea         rsi, [rsi + rax]            ; next line
.next_row_sp:
    ; Two output rows per iteration: (row, row+1) and (row+1, row+2).
    movq        xmm3, [rsi]                 ; load row + 1
    movq        xmm5, [rsi + 8]             ; load row + 1

    punpcklbw   xmm4, xmm3
    punpcklbw   xmm2, xmm5

    pmaddubsw   xmm4, xmm1
    movq        xmm7, [rsi + rax]           ; load row + 2

    pmaddubsw   xmm2, xmm1
    movq        xmm6, [rsi + rax + 8]       ; load row + 2

    punpcklbw   xmm3, xmm7
    punpcklbw   xmm5, xmm6

    pmaddubsw   xmm3, xmm1
    paddw       xmm4, [GLOBAL(rd)]

    pmaddubsw   xmm5, xmm1
    paddw       xmm2, [GLOBAL(rd)]

    psraw       xmm4, VP8_FILTER_SHIFT
    psraw       xmm2, VP8_FILTER_SHIFT

    packuswb    xmm4, xmm2
    paddw       xmm3, [GLOBAL(rd)]

    movdqa      [rdi], xmm4                 ; store row 0
    paddw       xmm5, [GLOBAL(rd)]

    psraw       xmm3, VP8_FILTER_SHIFT
    psraw       xmm5, VP8_FILTER_SHIFT

    packuswb    xmm3, xmm5
    movdqa      xmm4, xmm7                  ; row + 2 becomes row 0 of the next iteration

    movdqa      [rdi + rdx],xmm3            ; store row 1
    lea         rsi, [rsi + 2*rax]

    movdqa      xmm2, xmm6
    lea         rdi, [rdi + 2*rdx]

    cmp         rdi, rcx
    jne         .next_row_sp

    jmp         .done

; Horizontal pass only (yoffset == 0). On entry: xmm1 = HFilter,
; rsi = src_ptr, rdi = dst_ptr, rdx = dst_pitch.
.b16x16_fp_only:
    lea         rcx, [rdi+rdx*8]
    lea         rcx, [rcx+rdx*8]            ; rcx = dst_ptr + 16 * dst_pitch (end marker)
    movsxd      rax, dword ptr arg(1)       ; src_pixels_per_line

.next_row_fp:
    ; Two output rows per iteration, instructions of both rows interleaved.
    movq        xmm2, [rsi]                 ; 00 01 02 03 04 05 06 07
    movq        xmm4, [rsi+1]               ; 01 02 03 04 05 06 07 08

    punpcklbw   xmm2, xmm4
    movq        xmm3, [rsi+8]               ; 08 09 10 11 12 13 14 15

    pmaddubsw   xmm2, xmm1
    movq        xmm4, [rsi+9]               ; 09 10 11 12 13 14 15 16

    lea         rsi, [rsi + rax]            ; next line
    punpcklbw   xmm3, xmm4

    pmaddubsw   xmm3, xmm1
    movq        xmm5, [rsi]

    paddw       xmm2, [GLOBAL(rd)]
    movq        xmm7, [rsi+1]

    movq        xmm6, [rsi+8]
    psraw       xmm2, VP8_FILTER_SHIFT

    punpcklbw   xmm5, xmm7
    movq        xmm7, [rsi+9]

    paddw       xmm3, [GLOBAL(rd)]
    pmaddubsw   xmm5, xmm1

    psraw       xmm3, VP8_FILTER_SHIFT
    punpcklbw   xmm6, xmm7

    packuswb    xmm2, xmm3
    pmaddubsw   xmm6, xmm1

    movdqa      [rdi], xmm2                 ; store the results in the destination
    paddw       xmm5, [GLOBAL(rd)]

    lea         rdi, [rdi + rdx]            ; dst_pitch
    psraw       xmm5, VP8_FILTER_SHIFT

    paddw       xmm6, [GLOBAL(rd)]
    psraw       xmm6, VP8_FILTER_SHIFT

    packuswb    xmm5, xmm6
    lea         rsi, [rsi + rax]            ; next line

    movdqa      [rdi], xmm5                 ; store the results in the destination
    lea         rdi, [rdi + rdx]            ; dst_pitch

    cmp         rdi, rcx

    jne         .next_row_fp

.done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_bilinear_predict8x8_ssse3
;(
;    unsigned char  *src_ptr,
;    int             src_pixels_per_line,
;    int             xoffset,
;    int             yoffset,
;    unsigned char  *dst_ptr,
;    int             dst_pitch
;)
global sym(vp8_bilinear_predict8x8_ssse3) PRIVATE
sym(vp8_bilinear_predict8x8_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 144                    ;
reserve 144 bytes 1.1163 + 1.1164 + lea rcx, [GLOBAL(vp8_bilinear_filters_ssse3)] 1.1165 + 1.1166 + mov rsi, arg(0) ;src_ptr 1.1167 + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line 1.1168 + 1.1169 + ;Read 9-line unaligned data in and put them on stack. This gives a big 1.1170 + ;performance boost. 1.1171 + movdqu xmm0, [rsi] 1.1172 + lea rax, [rdx + rdx*2] 1.1173 + movdqu xmm1, [rsi+rdx] 1.1174 + movdqu xmm2, [rsi+rdx*2] 1.1175 + add rsi, rax 1.1176 + movdqu xmm3, [rsi] 1.1177 + movdqu xmm4, [rsi+rdx] 1.1178 + movdqu xmm5, [rsi+rdx*2] 1.1179 + add rsi, rax 1.1180 + movdqu xmm6, [rsi] 1.1181 + movdqu xmm7, [rsi+rdx] 1.1182 + 1.1183 + movdqa XMMWORD PTR [rsp], xmm0 1.1184 + 1.1185 + movdqu xmm0, [rsi+rdx*2] 1.1186 + 1.1187 + movdqa XMMWORD PTR [rsp+16], xmm1 1.1188 + movdqa XMMWORD PTR [rsp+32], xmm2 1.1189 + movdqa XMMWORD PTR [rsp+48], xmm3 1.1190 + movdqa XMMWORD PTR [rsp+64], xmm4 1.1191 + movdqa XMMWORD PTR [rsp+80], xmm5 1.1192 + movdqa XMMWORD PTR [rsp+96], xmm6 1.1193 + movdqa XMMWORD PTR [rsp+112], xmm7 1.1194 + movdqa XMMWORD PTR [rsp+128], xmm0 1.1195 + 1.1196 + movsxd rax, dword ptr arg(2) ; xoffset 1.1197 + cmp rax, 0 ; skip first_pass filter if xoffset=0 1.1198 + je .b8x8_sp_only 1.1199 + 1.1200 + shl rax, 4 1.1201 + add rax, rcx ; HFilter 1.1202 + 1.1203 + mov rdi, arg(4) ; dst_ptr 1.1204 + movsxd rdx, dword ptr arg(5) ; dst_pitch 1.1205 + 1.1206 + movdqa xmm0, [rax] 1.1207 + 1.1208 + movsxd rax, dword ptr arg(3) ; yoffset 1.1209 + cmp rax, 0 ; skip second_pass filter if yoffset=0 1.1210 + je .b8x8_fp_only 1.1211 + 1.1212 + shl rax, 4 1.1213 + lea rax, [rax + rcx] ; VFilter 1.1214 + 1.1215 + lea rcx, [rdi+rdx*8] 1.1216 + 1.1217 + movdqa xmm1, [rax] 1.1218 + 1.1219 + ; get the first horizontal line done 1.1220 + movdqa xmm3, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 1.1221 + movdqa xmm5, xmm3 ; 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 xx 1.1222 + 1.1223 + psrldq xmm5, 1 1.1224 + lea rsp, [rsp + 16] ; next line 1.1225 + 1.1226 + 
punpcklbw xmm3, xmm5 ; 00 01 01 02 02 03 03 04 04 05 05 06 06 07 07 08 1.1227 + pmaddubsw xmm3, xmm0 ; 00 02 04 06 08 10 12 14 1.1228 + 1.1229 + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value 1.1230 + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 1.1231 + 1.1232 + movdqa xmm7, xmm3 1.1233 + packuswb xmm7, xmm7 ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 1.1234 + 1.1235 +.next_row: 1.1236 + movdqa xmm6, [rsp] ; 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 15 1.1237 + lea rsp, [rsp + 16] ; next line 1.1238 + 1.1239 + movdqa xmm5, xmm6 1.1240 + 1.1241 + psrldq xmm5, 1 1.1242 + 1.1243 + punpcklbw xmm6, xmm5 1.1244 + pmaddubsw xmm6, xmm0 1.1245 + 1.1246 + paddw xmm6, [GLOBAL(rd)] ; xmm6 += round value 1.1247 + psraw xmm6, VP8_FILTER_SHIFT ; xmm6 /= 128 1.1248 + 1.1249 + packuswb xmm6, xmm6 1.1250 + 1.1251 + punpcklbw xmm7, xmm6 1.1252 + pmaddubsw xmm7, xmm1 1.1253 + 1.1254 + paddw xmm7, [GLOBAL(rd)] ; xmm7 += round value 1.1255 + psraw xmm7, VP8_FILTER_SHIFT ; xmm7 /= 128 1.1256 + 1.1257 + packuswb xmm7, xmm7 1.1258 + 1.1259 + movq [rdi], xmm7 ; store the results in the destination 1.1260 + lea rdi, [rdi + rdx] 1.1261 + 1.1262 + movdqa xmm7, xmm6 1.1263 + 1.1264 + cmp rdi, rcx 1.1265 + jne .next_row 1.1266 + 1.1267 + jmp .done8x8 1.1268 + 1.1269 +.b8x8_sp_only: 1.1270 + movsxd rax, dword ptr arg(3) ; yoffset 1.1271 + shl rax, 4 1.1272 + lea rax, [rax + rcx] ; VFilter 1.1273 + 1.1274 + mov rdi, arg(4) ;dst_ptr 1.1275 + movsxd rdx, dword ptr arg(5) ; dst_pitch 1.1276 + 1.1277 + movdqa xmm0, [rax] ; VFilter 1.1278 + 1.1279 + movq xmm1, XMMWORD PTR [rsp] 1.1280 + movq xmm2, XMMWORD PTR [rsp+16] 1.1281 + 1.1282 + movq xmm3, XMMWORD PTR [rsp+32] 1.1283 + punpcklbw xmm1, xmm2 1.1284 + 1.1285 + movq xmm4, XMMWORD PTR [rsp+48] 1.1286 + punpcklbw xmm2, xmm3 1.1287 + 1.1288 + movq xmm5, XMMWORD PTR [rsp+64] 1.1289 + punpcklbw xmm3, xmm4 1.1290 + 1.1291 + movq xmm6, XMMWORD PTR [rsp+80] 1.1292 + punpcklbw xmm4, xmm5 1.1293 + 1.1294 + movq xmm7, XMMWORD PTR [rsp+96] 1.1295 + 
punpcklbw xmm5, xmm6 1.1296 + 1.1297 + pmaddubsw xmm1, xmm0 1.1298 + pmaddubsw xmm2, xmm0 1.1299 + 1.1300 + pmaddubsw xmm3, xmm0 1.1301 + pmaddubsw xmm4, xmm0 1.1302 + 1.1303 + pmaddubsw xmm5, xmm0 1.1304 + punpcklbw xmm6, xmm7 1.1305 + 1.1306 + pmaddubsw xmm6, xmm0 1.1307 + paddw xmm1, [GLOBAL(rd)] 1.1308 + 1.1309 + paddw xmm2, [GLOBAL(rd)] 1.1310 + psraw xmm1, VP8_FILTER_SHIFT 1.1311 + 1.1312 + paddw xmm3, [GLOBAL(rd)] 1.1313 + psraw xmm2, VP8_FILTER_SHIFT 1.1314 + 1.1315 + paddw xmm4, [GLOBAL(rd)] 1.1316 + psraw xmm3, VP8_FILTER_SHIFT 1.1317 + 1.1318 + paddw xmm5, [GLOBAL(rd)] 1.1319 + psraw xmm4, VP8_FILTER_SHIFT 1.1320 + 1.1321 + paddw xmm6, [GLOBAL(rd)] 1.1322 + psraw xmm5, VP8_FILTER_SHIFT 1.1323 + 1.1324 + psraw xmm6, VP8_FILTER_SHIFT 1.1325 + packuswb xmm1, xmm1 1.1326 + 1.1327 + packuswb xmm2, xmm2 1.1328 + movq [rdi], xmm1 1.1329 + 1.1330 + packuswb xmm3, xmm3 1.1331 + movq [rdi+rdx], xmm2 1.1332 + 1.1333 + packuswb xmm4, xmm4 1.1334 + movq xmm1, XMMWORD PTR [rsp+112] 1.1335 + 1.1336 + lea rdi, [rdi + 2*rdx] 1.1337 + movq xmm2, XMMWORD PTR [rsp+128] 1.1338 + 1.1339 + packuswb xmm5, xmm5 1.1340 + movq [rdi], xmm3 1.1341 + 1.1342 + packuswb xmm6, xmm6 1.1343 + movq [rdi+rdx], xmm4 1.1344 + 1.1345 + lea rdi, [rdi + 2*rdx] 1.1346 + punpcklbw xmm7, xmm1 1.1347 + 1.1348 + movq [rdi], xmm5 1.1349 + pmaddubsw xmm7, xmm0 1.1350 + 1.1351 + movq [rdi+rdx], xmm6 1.1352 + punpcklbw xmm1, xmm2 1.1353 + 1.1354 + pmaddubsw xmm1, xmm0 1.1355 + paddw xmm7, [GLOBAL(rd)] 1.1356 + 1.1357 + psraw xmm7, VP8_FILTER_SHIFT 1.1358 + paddw xmm1, [GLOBAL(rd)] 1.1359 + 1.1360 + psraw xmm1, VP8_FILTER_SHIFT 1.1361 + packuswb xmm7, xmm7 1.1362 + 1.1363 + packuswb xmm1, xmm1 1.1364 + lea rdi, [rdi + 2*rdx] 1.1365 + 1.1366 + movq [rdi], xmm7 1.1367 + 1.1368 + movq [rdi+rdx], xmm1 1.1369 + lea rsp, [rsp + 144] 1.1370 + 1.1371 + jmp .done8x8 1.1372 + 1.1373 +.b8x8_fp_only: 1.1374 + lea rcx, [rdi+rdx*8] 1.1375 + 1.1376 +.next_row_fp: 1.1377 + movdqa xmm1, XMMWORD PTR [rsp] 1.1378 + movdqa 
xmm3, XMMWORD PTR [rsp+16] 1.1379 + 1.1380 + movdqa xmm2, xmm1 1.1381 + movdqa xmm5, XMMWORD PTR [rsp+32] 1.1382 + 1.1383 + psrldq xmm2, 1 1.1384 + movdqa xmm7, XMMWORD PTR [rsp+48] 1.1385 + 1.1386 + movdqa xmm4, xmm3 1.1387 + psrldq xmm4, 1 1.1388 + 1.1389 + movdqa xmm6, xmm5 1.1390 + psrldq xmm6, 1 1.1391 + 1.1392 + punpcklbw xmm1, xmm2 1.1393 + pmaddubsw xmm1, xmm0 1.1394 + 1.1395 + punpcklbw xmm3, xmm4 1.1396 + pmaddubsw xmm3, xmm0 1.1397 + 1.1398 + punpcklbw xmm5, xmm6 1.1399 + pmaddubsw xmm5, xmm0 1.1400 + 1.1401 + movdqa xmm2, xmm7 1.1402 + psrldq xmm2, 1 1.1403 + 1.1404 + punpcklbw xmm7, xmm2 1.1405 + pmaddubsw xmm7, xmm0 1.1406 + 1.1407 + paddw xmm1, [GLOBAL(rd)] 1.1408 + psraw xmm1, VP8_FILTER_SHIFT 1.1409 + 1.1410 + paddw xmm3, [GLOBAL(rd)] 1.1411 + psraw xmm3, VP8_FILTER_SHIFT 1.1412 + 1.1413 + paddw xmm5, [GLOBAL(rd)] 1.1414 + psraw xmm5, VP8_FILTER_SHIFT 1.1415 + 1.1416 + paddw xmm7, [GLOBAL(rd)] 1.1417 + psraw xmm7, VP8_FILTER_SHIFT 1.1418 + 1.1419 + packuswb xmm1, xmm1 1.1420 + packuswb xmm3, xmm3 1.1421 + 1.1422 + packuswb xmm5, xmm5 1.1423 + movq [rdi], xmm1 1.1424 + 1.1425 + packuswb xmm7, xmm7 1.1426 + movq [rdi+rdx], xmm3 1.1427 + 1.1428 + lea rdi, [rdi + 2*rdx] 1.1429 + movq [rdi], xmm5 1.1430 + 1.1431 + lea rsp, [rsp + 4*16] 1.1432 + movq [rdi+rdx], xmm7 1.1433 + 1.1434 + lea rdi, [rdi + 2*rdx] 1.1435 + cmp rdi, rcx 1.1436 + 1.1437 + jne .next_row_fp 1.1438 + 1.1439 + lea rsp, [rsp + 16] 1.1440 + 1.1441 +.done8x8: 1.1442 + ;add rsp, 144 1.1443 + pop rsp 1.1444 + ; begin epilog 1.1445 + pop rdi 1.1446 + pop rsi 1.1447 + RESTORE_GOT 1.1448 + RESTORE_XMM 1.1449 + UNSHADOW_ARGS 1.1450 + pop rbp 1.1451 + ret 1.1452 + 1.1453 +SECTION_RODATA 1.1454 +align 16 1.1455 +shuf1b: 1.1456 + db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 1.1457 +shuf2b: 1.1458 + db 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11 1.1459 +shuf3b: 1.1460 + db 1, 3, 2, 4, 3, 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10 1.1461 + 1.1462 +align 16 1.1463 +shuf2bfrom1: 1.1464 + db 4, 8, 
6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11, 9,13 1.1465 +align 16 1.1466 +shuf3bfrom1: 1.1467 + db 2, 6, 4, 8, 6, 1, 8, 3, 1, 5, 3, 7, 5, 9, 7,11 1.1468 + 1.1469 +align 16 1.1470 +rd: 1.1471 + times 8 dw 0x40 1.1472 + 1.1473 +align 16 1.1474 +k0_k5: 1.1475 + times 8 db 0, 0 ;placeholder 1.1476 + times 8 db 0, 0 1.1477 + times 8 db 2, 1 1.1478 + times 8 db 0, 0 1.1479 + times 8 db 3, 3 1.1480 + times 8 db 0, 0 1.1481 + times 8 db 1, 2 1.1482 + times 8 db 0, 0 1.1483 +k1_k3: 1.1484 + times 8 db 0, 0 ;placeholder 1.1485 + times 8 db -6, 12 1.1486 + times 8 db -11, 36 1.1487 + times 8 db -9, 50 1.1488 + times 8 db -16, 77 1.1489 + times 8 db -6, 93 1.1490 + times 8 db -8, 108 1.1491 + times 8 db -1, 123 1.1492 +k2_k4: 1.1493 + times 8 db 128, 0 ;placeholder 1.1494 + times 8 db 123, -1 1.1495 + times 8 db 108, -8 1.1496 + times 8 db 93, -6 1.1497 + times 8 db 77, -16 1.1498 + times 8 db 50, -9 1.1499 + times 8 db 36, -11 1.1500 + times 8 db 12, -6 1.1501 +align 16 1.1502 +vp8_bilinear_filters_ssse3: 1.1503 + times 8 db 128, 0 1.1504 + times 8 db 112, 16 1.1505 + times 8 db 96, 32 1.1506 + times 8 db 80, 48 1.1507 + times 8 db 64, 64 1.1508 + times 8 db 48, 80 1.1509 + times 8 db 32, 96 1.1510 + times 8 db 16, 112 1.1511 +