1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libvpx/vp8/common/x86/subpixel_sse2.asm Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,1372 @@ 1.4 +; 1.5 +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 1.6 +; 1.7 +; Use of this source code is governed by a BSD-style license 1.8 +; that can be found in the LICENSE file in the root of the source 1.9 +; tree. An additional intellectual property rights grant can be found 1.10 +; in the file PATENTS. All contributing project authors may 1.11 +; be found in the AUTHORS file in the root of the source tree. 1.12 +; 1.13 + 1.14 + 1.15 +%include "vpx_ports/x86_abi_support.asm" 1.16 +extern sym(vp8_bilinear_filters_x86_8) 1.17 + 1.18 +%define BLOCK_HEIGHT_WIDTH 4 1.19 +%define VP8_FILTER_WEIGHT 128 1.20 +%define VP8_FILTER_SHIFT 7 1.21 + 1.22 + 1.23 +;/************************************************************************************ 1.24 +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The 1.25 +; input pixel array has output_height rows. This routine assumes that output_height is an 1.26 +; even number. This function handles 8 pixels in horizontal direction, calculating ONE 1.27 +; row each iteration to take advantage of the 128-bit operations. 
1.28 +;*************************************************************************************/ 1.29 +;void vp8_filter_block1d8_h6_sse2 1.30 +;( 1.31 +; unsigned char *src_ptr, ; 16 bytes starting at src_ptr - 2 are loaded for every row 1.32 +; unsigned short *output_ptr, ; 8 words per row, stored with movdqa (needs 16-byte alignment) 1.33 +; unsigned int src_pixels_per_line, ; added to src_ptr after every row 1.34 +; unsigned int pixel_step, ; not read by this routine 1.35 +; unsigned int output_height, ; row count; the dec/jnz loop needs it to be non-zero 1.36 +; unsigned int output_width, ; added to output_ptr after every row (a byte count) 1.37 +; short *vp8_filter ; six 8-word tap vectors at byte offsets 0, 16, 32, 48, 64, 80 1.38 +;) 1.39 +global sym(vp8_filter_block1d8_h6_sse2) PRIVATE 1.40 +sym(vp8_filter_block1d8_h6_sse2): 1.41 + push rbp 1.42 + mov rbp, rsp 1.43 + SHADOW_ARGS_TO_STACK 7 1.44 + SAVE_XMM 7 1.45 + GET_GOT rbx 1.46 + push rsi 1.47 + push rdi 1.48 + ; end prolog 1.49 + 1.50 + mov rdx, arg(6) ;vp8_filter 1.51 + mov rsi, arg(0) ;src_ptr 1.52 + 1.53 + mov rdi, arg(1) ;output_ptr 1.54 + 1.55 + movsxd rcx, dword ptr arg(4) ;output_height 1.56 + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source 1.57 +%if ABI_IS_32BIT=0 1.58 + movsxd r8, dword ptr arg(5) ;output_width 1.59 +%endif 1.60 + pxor xmm0, xmm0 ; clear xmm0 for unpack 1.61 + 1.62 +.filter_block1d8_h6_rowloop: 1.63 + movq xmm3, MMWORD PTR [rsi - 2] ; src[-2 .. 5] 1.64 + movq xmm1, MMWORD PTR [rsi + 6] ; src[ 6 .. 13] 1.65 + 1.66 + prefetcht2 [rsi+rax-2] ; prefetch hint for the next source row 1.67 + 1.68 + pslldq xmm1, 8 1.69 + por xmm1, xmm3 ; xmm1 = src[-2 .. 13] 1.70 + 1.71 + movdqa xmm4, xmm1 1.72 + movdqa xmm5, xmm1 1.73 + 1.74 + movdqa xmm6, xmm1 1.75 + movdqa xmm7, xmm1 1.76 + 1.77 + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2 1.78 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 1.79 + 1.80 + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 1.81 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 1.82 + 1.83 + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 1.84 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 1.85 + 1.86 + 1.87 + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 1.88 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 1.89 + 1.90 + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 1.91 + 
1.92 + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 1.93 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 1.94 + 1.95 + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 1.96 + 1.97 + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 1.98 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 1.99 + 1.100 + 1.101 + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 1.102 + 1.103 + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 1.104 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 1.105 + 1.106 + 1.107 + paddsw xmm4, xmm7 ; saturating sum of the six products, accumulated in tap order 2 5 3 1 4 6 1.108 + paddsw xmm4, xmm5 1.109 + 1.110 + paddsw xmm4, xmm3 1.111 + paddsw xmm4, xmm6 1.112 + 1.113 + paddsw xmm4, xmm1 1.114 + paddsw xmm4, [GLOBAL(rd)] ; += round value 1.115 + 1.116 + psraw xmm4, 7 ; arithmetic shift by 7 (same value as VP8_FILTER_SHIFT) 1.117 + 1.118 + packuswb xmm4, xmm0 ; clamp each word to 0..255 1.119 + punpcklbw xmm4, xmm0 ; zero-extend back to 8 words 1.120 + 1.121 + movdqa XMMWORD Ptr [rdi], xmm4 ; aligned 16-byte store of the row 1.122 + lea rsi, [rsi + rax] ; next source row 1.123 + 1.124 +%if ABI_IS_32BIT 1.125 + add rdi, DWORD Ptr arg(5) ;[output_width] 1.126 +%else 1.127 + add rdi, r8 1.128 +%endif 1.129 + dec rcx 1.130 + 1.131 + jnz .filter_block1d8_h6_rowloop ; next row 1.132 + 1.133 + ; begin epilog 1.134 + pop rdi 1.135 + pop rsi 1.136 + RESTORE_GOT 1.137 + RESTORE_XMM 1.138 + UNSHADOW_ARGS 1.139 + pop rbp 1.140 + ret 1.141 + 1.142 + 1.143 +;void vp8_filter_block1d16_h6_sse2 1.144 +;( 1.145 +; unsigned char *src_ptr, 1.146 +; unsigned short *output_ptr, 1.147 +; unsigned int src_pixels_per_line, 1.148 +; unsigned int pixel_step, 1.149 +; unsigned int output_height, 1.150 +; unsigned int output_width, 1.151 +; short *vp8_filter 1.152 +;) 1.153 +;/************************************************************************************ 1.154 +; Notes: filter_block1d_h6 applies a 6 tap filter horizontally to the input pixels. The 1.155 +; input pixel array has output_height rows. This routine assumes that output_height is an 1.156 +; even number. 
This function handles 8 pixels in horizontal direction, calculating ONE 1.157 +; row each iteration to take advantage of the 128-bit operations. NOTE: this 16-wide variant produces 16 outputs per row, as two 8-word halves stored at [rdi] and [rdi+16]. 1.158 +;*************************************************************************************/ 1.159 +global sym(vp8_filter_block1d16_h6_sse2) PRIVATE 1.160 +sym(vp8_filter_block1d16_h6_sse2): 1.161 + push rbp 1.162 + mov rbp, rsp 1.163 + SHADOW_ARGS_TO_STACK 7 1.164 + SAVE_XMM 7 1.165 + GET_GOT rbx 1.166 + push rsi 1.167 + push rdi 1.168 + ; end prolog 1.169 + 1.170 + mov rdx, arg(6) ;vp8_filter 1.171 + mov rsi, arg(0) ;src_ptr 1.172 + 1.173 + mov rdi, arg(1) ;output_ptr 1.174 + 1.175 + movsxd rcx, dword ptr arg(4) ;output_height 1.176 + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source 1.177 +%if ABI_IS_32BIT=0 1.178 + movsxd r8, dword ptr arg(5) ;output_width 1.179 +%endif 1.180 + 1.181 + pxor xmm0, xmm0 ; clear xmm0 for unpack 1.182 + 1.183 +.filter_block1d16_h6_sse2_rowloop: 1.184 + movq xmm3, MMWORD PTR [rsi - 2] ; src[-2 .. 5] 1.185 + movq xmm1, MMWORD PTR [rsi + 6] ; src[ 6 .. 13] 1.186 + 1.187 + movq xmm2, MMWORD PTR [rsi +14] ; src[14 .. 21] 1.188 + pslldq xmm2, 8 1.189 + 1.190 + por xmm2, xmm1 ; xmm2 = src[6 .. 21], input for outputs 8..15 1.191 + prefetcht2 [rsi+rax-2] ; prefetch hint for the next source row 1.192 + 1.193 + pslldq xmm1, 8 1.194 + por xmm1, xmm3 ; xmm1 = src[-2 .. 13], input for outputs 0..7 1.195 + 1.196 + movdqa xmm4, xmm1 1.197 + movdqa xmm5, xmm1 1.198 + 1.199 + movdqa xmm6, xmm1 1.200 + movdqa xmm7, xmm1 1.201 + 1.202 + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2 1.203 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 1.204 + 1.205 + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 1.206 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 1.207 + 1.208 + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 1.209 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 1.210 + 1.211 + 1.212 + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 1.213 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 1.214 + 1.215 + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; 
Tap 3 1.216 + 1.217 + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 1.218 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 1.219 + 1.220 + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 1.221 + 1.222 + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 1.223 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 1.224 + 1.225 + 1.226 + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 1.227 + 1.228 + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 1.229 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 1.230 + 1.231 + paddsw xmm4, xmm7 ; saturating sum of the six products, accumulated in tap order 2 5 3 1 4 6 1.232 + paddsw xmm4, xmm5 1.233 + 1.234 + paddsw xmm4, xmm3 1.235 + paddsw xmm4, xmm6 1.236 + 1.237 + paddsw xmm4, xmm1 1.238 + paddsw xmm4, [GLOBAL(rd)] ; += round value 1.239 + 1.240 + psraw xmm4, 7 ; arithmetic shift by 7 (same value as VP8_FILTER_SHIFT) 1.241 + 1.242 + packuswb xmm4, xmm0 ; clamp each word to 0..255 1.243 + punpcklbw xmm4, xmm0 ; zero-extend back to 8 words 1.244 + 1.245 + movdqa XMMWORD Ptr [rdi], xmm4 ; outputs 0..7 (aligned 16-byte store) 1.246 + 1.247 + movdqa xmm3, xmm2 ; second half: byte 0 of xmm2 is src[6], so the xxNN indices below are relative to pixel 8 1.248 + movdqa xmm4, xmm2 1.249 + 1.250 + movdqa xmm5, xmm2 1.251 + movdqa xmm6, xmm2 1.252 + 1.253 + movdqa xmm7, xmm2 1.254 + 1.255 + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2 1.256 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 1.257 + 1.258 + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 1.259 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 1.260 + 1.261 + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 1.262 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 1.263 + 1.264 + 1.265 + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 1.266 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 1.267 + 1.268 + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 1.269 + 1.270 + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 1.271 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 1.272 + 1.273 + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 1.274 + 1.275 + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 
xx04 xx03 xx02 1.276 + psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 1.277 + 1.278 + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 1.279 + 1.280 + punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 1.281 + pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 1.282 + 1.283 + 1.284 + paddsw xmm4, xmm7 1.285 + paddsw xmm4, xmm5 1.286 + 1.287 + paddsw xmm4, xmm3 1.288 + paddsw xmm4, xmm6 1.289 + 1.290 + paddsw xmm4, xmm2 1.291 + paddsw xmm4, [GLOBAL(rd)] ; += round value 1.292 + 1.293 + psraw xmm4, 7 1.294 + 1.295 + packuswb xmm4, xmm0 1.296 + punpcklbw xmm4, xmm0 1.297 + 1.298 + movdqa XMMWORD Ptr [rdi+16], xmm4 ; outputs 8..15 (aligned 16-byte store) 1.299 + 1.300 + lea rsi, [rsi + rax] ; next source row 1.301 +%if ABI_IS_32BIT 1.302 + add rdi, DWORD Ptr arg(5) ;[output_width] 1.303 +%else 1.304 + add rdi, r8 1.305 +%endif 1.306 + 1.307 + dec rcx 1.308 + jnz .filter_block1d16_h6_sse2_rowloop ; next row 1.309 + 1.310 + ; begin epilog 1.311 + pop rdi 1.312 + pop rsi 1.313 + RESTORE_GOT 1.314 + RESTORE_XMM 1.315 + UNSHADOW_ARGS 1.316 + pop rbp 1.317 + ret 1.318 + 1.319 + 1.320 +;void vp8_filter_block1d8_v6_sse2 1.321 +;( 1.322 +; short *src_ptr, 1.323 +; unsigned char *output_ptr, 1.324 +; int dst_pitch, 1.325 +; unsigned int pixels_per_line, 1.326 +; unsigned int pixel_step, 1.327 +; unsigned int output_height, 1.328 +; unsigned int output_width, 1.329 +; short * vp8_filter 1.330 +;) 1.331 +;/************************************************************************************ 1.332 +; Notes: filter_block1d8_v6 applies a 6 tap filter vertically to the input pixels. The 1.333 +; input pixel array has output_height rows. 
1.334 +;*************************************************************************************/ 1.335 +global sym(vp8_filter_block1d8_v6_sse2) PRIVATE 1.336 +sym(vp8_filter_block1d8_v6_sse2): 1.337 + push rbp 1.338 + mov rbp, rsp 1.339 + SHADOW_ARGS_TO_STACK 8 1.340 + SAVE_XMM 7 1.341 + GET_GOT rbx 1.342 + push rsi 1.343 + push rdi 1.344 + ; end prolog 1.345 + 1.346 + mov rax, arg(7) ;vp8_filter 1.347 + movsxd rdx, dword ptr arg(3) ;pixels_per_line 1.348 + 1.349 + mov rdi, arg(1) ;output_ptr 1.350 + mov rsi, arg(0) ;src_ptr 1.351 + 1.352 + sub rsi, rdx 1.353 + sub rsi, rdx ; rsi = src_ptr - 2 * pixels_per_line (rdx is applied as a byte offset) 1.354 + 1.355 + movsxd rcx, DWORD PTR arg(5) ;[output_height] 1.356 + pxor xmm0, xmm0 ; clear xmm0 1.357 + 1.358 + movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] ; round value, kept in xmm7 for the whole loop 1.359 +%if ABI_IS_32BIT=0 1.360 + movsxd r8, dword ptr arg(2) ; dst_pitch 1.361 +%endif 1.362 + 1.363 +.vp8_filter_block1d8_v6_sse2_loop: 1.364 + movdqa xmm1, XMMWORD PTR [rsi] ; line 1 1.365 + pmullw xmm1, [rax] 1.366 + 1.367 + movdqa xmm2, XMMWORD PTR [rsi + rdx] ; line 2 1.368 + pmullw xmm2, [rax + 16] 1.369 + 1.370 + movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 3 1.371 + pmullw xmm3, [rax + 32] 1.372 + 1.373 + movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 5 1.374 + pmullw xmm5, [rax + 64] 1.375 + 1.376 + add rsi, rdx ; step one row; this is also the per-iteration source advance 1.377 + movdqa xmm4, XMMWORD PTR [rsi + rdx * 2] ; line 4 1.378 + 1.379 + pmullw xmm4, [rax + 48] 1.380 + movdqa xmm6, XMMWORD PTR [rsi + rdx * 4] ; line 6 1.381 + 1.382 + pmullw xmm6, [rax + 80] 1.383 + 1.384 + paddsw xmm2, xmm5 1.385 + paddsw xmm2, xmm3 1.386 + 1.387 + paddsw xmm2, xmm1 1.388 + paddsw xmm2, xmm4 1.389 + 1.390 + paddsw xmm2, xmm6 1.391 + paddsw xmm2, xmm7 1.392 + 1.393 + psraw xmm2, 7 1.394 + packuswb xmm2, xmm0 ; pack and saturate 1.395 + 1.396 + movq QWORD PTR [rdi], xmm2 ; store the results in the destination 1.397 +%if ABI_IS_32BIT 1.398 + add rdi, DWORD PTR arg(2) ;[dst_pitch] 1.399 +%else 1.400 + add rdi, r8 1.401 +%endif 1.402 + dec rcx ; decrement count 1.403 + jnz .vp8_filter_block1d8_v6_sse2_loop ; next row 1.404 + 1.405 + ; begin epilog 1.406 + pop 
rdi 1.407 + pop rsi 1.408 + RESTORE_GOT 1.409 + RESTORE_XMM 1.410 + UNSHADOW_ARGS 1.411 + pop rbp 1.412 + ret 1.413 + 1.414 + 1.415 +;void vp8_filter_block1d16_v6_sse2 1.416 +;( 1.417 +; unsigned short *src_ptr, 1.418 +; unsigned char *output_ptr, 1.419 +; int dst_pitch, 1.420 +; unsigned int pixels_per_line, 1.421 +; unsigned int pixel_step, 1.422 +; unsigned int output_height, 1.423 +; unsigned int output_width, 1.424 +; const short *vp8_filter 1.425 +;) 1.426 +;/************************************************************************************ 1.427 +; Notes: filter_block1d16_v6 applies a 6 tap filter vertically to the input pixels. The 1.428 +; input pixel array has output_height rows. 1.429 +;*************************************************************************************/ 1.430 +global sym(vp8_filter_block1d16_v6_sse2) PRIVATE 1.431 +sym(vp8_filter_block1d16_v6_sse2): 1.432 + push rbp 1.433 + mov rbp, rsp 1.434 + SHADOW_ARGS_TO_STACK 8 1.435 + SAVE_XMM 7 1.436 + GET_GOT rbx 1.437 + push rsi 1.438 + push rdi 1.439 + ; end prolog 1.440 + 1.441 + mov rax, arg(7) ;vp8_filter 1.442 + movsxd rdx, dword ptr arg(3) ;pixels_per_line 1.443 + 1.444 + mov rdi, arg(1) ;output_ptr 1.445 + mov rsi, arg(0) ;src_ptr 1.446 + 1.447 + sub rsi, rdx 1.448 + sub rsi, rdx ; rsi = src_ptr - 2 * pixels_per_line (rdx is applied as a byte offset) 1.449 + 1.450 + movsxd rcx, DWORD PTR arg(5) ;[output_height] 1.451 +%if ABI_IS_32BIT=0 1.452 + movsxd r8, dword ptr arg(2) ; dst_pitch 1.453 +%endif 1.454 + 1.455 +.vp8_filter_block1d16_v6_sse2_loop: 1.456 +; The order for adding 6-tap is 2 5 3 1 4 6. Read in data in that order. 
1.457 + movdqa xmm1, XMMWORD PTR [rsi + rdx] ; line 2 1.458 + movdqa xmm2, XMMWORD PTR [rsi + rdx + 16] 1.459 + pmullw xmm1, [rax + 16] 1.460 + pmullw xmm2, [rax + 16] 1.461 + 1.462 + movdqa xmm3, XMMWORD PTR [rsi + rdx * 4] ; line 5 1.463 + movdqa xmm4, XMMWORD PTR [rsi + rdx * 4 + 16] 1.464 + pmullw xmm3, [rax + 64] 1.465 + pmullw xmm4, [rax + 64] 1.466 + 1.467 + movdqa xmm5, XMMWORD PTR [rsi + rdx * 2] ; line 3 1.468 + movdqa xmm6, XMMWORD PTR [rsi + rdx * 2 + 16] 1.469 + pmullw xmm5, [rax + 32] 1.470 + pmullw xmm6, [rax + 32] 1.471 + 1.472 + movdqa xmm7, XMMWORD PTR [rsi] ; line 1 1.473 + movdqa xmm0, XMMWORD PTR [rsi + 16] 1.474 + pmullw xmm7, [rax] 1.475 + pmullw xmm0, [rax] 1.476 + 1.477 + paddsw xmm1, xmm3 1.478 + paddsw xmm2, xmm4 1.479 + paddsw xmm1, xmm5 1.480 + paddsw xmm2, xmm6 1.481 + paddsw xmm1, xmm7 1.482 + paddsw xmm2, xmm0 1.483 + 1.484 + add rsi, rdx ; step one row; this is also the per-iteration source advance 1.485 + 1.486 + movdqa xmm3, XMMWORD PTR [rsi + rdx * 2] ; line 4 1.487 + movdqa xmm4, XMMWORD PTR [rsi + rdx * 2 + 16] 1.488 + pmullw xmm3, [rax + 48] 1.489 + pmullw xmm4, [rax + 48] 1.490 + 1.491 + movdqa xmm5, XMMWORD PTR [rsi + rdx * 4] ; line 6 1.492 + movdqa xmm6, XMMWORD PTR [rsi + rdx * 4 + 16] 1.493 + pmullw xmm5, [rax + 80] 1.494 + pmullw xmm6, [rax + 80] 1.495 + 1.496 + movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] ; round value, reloaded because xmm7 held line 1 above 1.497 + pxor xmm0, xmm0 ; clear xmm0 1.498 + 1.499 + paddsw xmm1, xmm3 1.500 + paddsw xmm2, xmm4 1.501 + paddsw xmm1, xmm5 1.502 + paddsw xmm2, xmm6 1.503 + 1.504 + paddsw xmm1, xmm7 1.505 + paddsw xmm2, xmm7 1.506 + 1.507 + psraw xmm1, 7 1.508 + psraw xmm2, 7 1.509 + 1.510 + packuswb xmm1, xmm2 ; pack and saturate 1.511 + movdqa XMMWORD PTR [rdi], xmm1 ; store the results in the destination 1.512 +%if ABI_IS_32BIT 1.513 + add rdi, DWORD PTR arg(2) ;[dst_pitch] 1.514 +%else 1.515 + add rdi, r8 1.516 +%endif 1.517 + dec rcx ; decrement count 1.518 + jnz .vp8_filter_block1d16_v6_sse2_loop ; next row 1.519 + 1.520 + ; begin epilog 1.521 + pop rdi 1.522 + pop rsi 1.523 + RESTORE_GOT 1.524 
+ RESTORE_XMM 1.525 + UNSHADOW_ARGS 1.526 + pop rbp 1.527 + ret 1.528 + 1.529 + 1.530 +;void vp8_filter_block1d8_h6_only_sse2 1.531 +;( 1.532 +; unsigned char *src_ptr, 1.533 +; unsigned int src_pixels_per_line, 1.534 +; unsigned char *output_ptr, 1.535 +; int dst_pitch, 1.536 +; unsigned int output_height, 1.537 +; const short *vp8_filter 1.538 +;) 1.539 +; First-pass filter only when yoffset==0 1.540 +global sym(vp8_filter_block1d8_h6_only_sse2) PRIVATE 1.541 +sym(vp8_filter_block1d8_h6_only_sse2): 1.542 + push rbp 1.543 + mov rbp, rsp 1.544 + SHADOW_ARGS_TO_STACK 6 1.545 + SAVE_XMM 7 1.546 + GET_GOT rbx 1.547 + push rsi 1.548 + push rdi 1.549 + ; end prolog 1.550 + 1.551 + mov rdx, arg(5) ;vp8_filter 1.552 + mov rsi, arg(0) ;src_ptr 1.553 + 1.554 + mov rdi, arg(2) ;output_ptr 1.555 + 1.556 + movsxd rcx, dword ptr arg(4) ;output_height 1.557 + movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source 1.558 +%if ABI_IS_32BIT=0 1.559 + movsxd r8, dword ptr arg(3) ;dst_pitch 1.560 +%endif 1.561 + pxor xmm0, xmm0 ; clear xmm0 for unpack 1.562 + 1.563 +.filter_block1d8_h6_only_rowloop: 1.564 + movq xmm3, MMWORD PTR [rsi - 2] 1.565 + movq xmm1, MMWORD PTR [rsi + 6] 1.566 + 1.567 + prefetcht2 [rsi+rax-2] 1.568 + 1.569 + pslldq xmm1, 8 1.570 + por xmm1, xmm3 1.571 + 1.572 + movdqa xmm4, xmm1 1.573 + movdqa xmm5, xmm1 1.574 + 1.575 + movdqa xmm6, xmm1 1.576 + movdqa xmm7, xmm1 1.577 + 1.578 + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2 1.579 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 1.580 + 1.581 + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 1.582 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 1.583 + 1.584 + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 1.585 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 1.586 + 1.587 + 1.588 + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 1.589 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 
01 1.590 + 1.591 + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 1.592 + 1.593 + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 1.594 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 1.595 + 1.596 + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 1.597 + 1.598 + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 1.599 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 1.600 + 1.601 + 1.602 + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 1.603 + 1.604 + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 1.605 + pmullw xmm1, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 1.606 + 1.607 + 1.608 + paddsw xmm4, xmm7 1.609 + paddsw xmm4, xmm5 1.610 + 1.611 + paddsw xmm4, xmm3 1.612 + paddsw xmm4, xmm6 1.613 + 1.614 + paddsw xmm4, xmm1 1.615 + paddsw xmm4, [GLOBAL(rd)] 1.616 + 1.617 + psraw xmm4, 7 1.618 + 1.619 + packuswb xmm4, xmm0 ; clamp to 0..255; the low 8 bytes hold the row 1.620 + 1.621 + movq QWORD PTR [rdi], xmm4 ; store the results in the destination 1.622 + lea rsi, [rsi + rax] 1.623 + 1.624 +%if ABI_IS_32BIT 1.625 + add rdi, DWORD Ptr arg(3) ;dst_pitch 1.626 +%else 1.627 + add rdi, r8 1.628 +%endif 1.629 + dec rcx 1.630 + 1.631 + jnz .filter_block1d8_h6_only_rowloop ; next row 1.632 + 1.633 + ; begin epilog 1.634 + pop rdi 1.635 + pop rsi 1.636 + RESTORE_GOT 1.637 + RESTORE_XMM 1.638 + UNSHADOW_ARGS 1.639 + pop rbp 1.640 + ret 1.641 + 1.642 + 1.643 +;void vp8_filter_block1d16_h6_only_sse2 1.644 +;( 1.645 +; unsigned char *src_ptr, 1.646 +; unsigned int src_pixels_per_line, 1.647 +; unsigned char *output_ptr, 1.648 +; int dst_pitch, 1.649 +; unsigned int output_height, 1.650 +; const short *vp8_filter 1.651 +;) 1.652 +; First-pass filter only when yoffset==0 1.653 +global sym(vp8_filter_block1d16_h6_only_sse2) PRIVATE 1.654 +sym(vp8_filter_block1d16_h6_only_sse2): 1.655 + push rbp 1.656 + mov rbp, rsp 1.657 + SHADOW_ARGS_TO_STACK 6 1.658 + SAVE_XMM 7 1.659 + GET_GOT rbx 1.660 + push rsi 1.661 + push rdi 1.662 + ; end prolog 1.663 + 1.664 + mov rdx, 
arg(5) ;vp8_filter 1.665 + mov rsi, arg(0) ;src_ptr 1.666 + 1.667 + mov rdi, arg(2) ;output_ptr 1.668 + 1.669 + movsxd rcx, dword ptr arg(4) ;output_height 1.670 + movsxd rax, dword ptr arg(1) ;src_pixels_per_line ; Pitch for Source 1.671 +%if ABI_IS_32BIT=0 1.672 + movsxd r8, dword ptr arg(3) ;dst_pitch 1.673 +%endif 1.674 + 1.675 + pxor xmm0, xmm0 ; clear xmm0 for unpack 1.676 + 1.677 +.filter_block1d16_h6_only_sse2_rowloop: 1.678 + movq xmm3, MMWORD PTR [rsi - 2] 1.679 + movq xmm1, MMWORD PTR [rsi + 6] 1.680 + 1.681 + movq xmm2, MMWORD PTR [rsi +14] 1.682 + pslldq xmm2, 8 1.683 + 1.684 + por xmm2, xmm1 1.685 + prefetcht2 [rsi+rax-2] 1.686 + 1.687 + pslldq xmm1, 8 1.688 + por xmm1, xmm3 1.689 + 1.690 + movdqa xmm4, xmm1 1.691 + movdqa xmm5, xmm1 1.692 + 1.693 + movdqa xmm6, xmm1 1.694 + movdqa xmm7, xmm1 1.695 + 1.696 + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2 1.697 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 1.698 + 1.699 + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 1.700 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 1.701 + 1.702 + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 1.703 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 1.704 + 1.705 + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 1.706 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 1.707 + 1.708 + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 1.709 + 1.710 + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 1.711 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 1.712 + 1.713 + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 1.714 + 1.715 + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 1.716 + psrldq xmm1, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 1.717 + 1.718 + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 1.719 + 1.720 + punpcklbw xmm1, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 1.721 + pmullw xmm1, 
[rdx+80] ; x[ 3] * h[ 3] ; Tap 6 1.722 + 1.723 + paddsw xmm4, xmm7 1.724 + paddsw xmm4, xmm5 1.725 + 1.726 + paddsw xmm4, xmm3 1.727 + paddsw xmm4, xmm6 1.728 + 1.729 + paddsw xmm4, xmm1 1.730 + paddsw xmm4, [GLOBAL(rd)] 1.731 + 1.732 + psraw xmm4, 7 1.733 + 1.734 + packuswb xmm4, xmm0 ; lower 8 bytes 1.735 + 1.736 + movq QWORD Ptr [rdi], xmm4 ; store the results in the destination 1.737 + 1.738 + movdqa xmm3, xmm2 ; second half: byte 0 of xmm2 is src[6], so the xxNN indices below are relative to pixel 8 1.739 + movdqa xmm4, xmm2 1.740 + 1.741 + movdqa xmm5, xmm2 1.742 + movdqa xmm6, xmm2 1.743 + 1.744 + movdqa xmm7, xmm2 1.745 + 1.746 + punpcklbw xmm3, xmm0 ; xx05 xx04 xx03 xx02 xx01 xx00 xx-1 xx-2 1.747 + psrldq xmm4, 1 ; xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 -1 1.748 + 1.749 + pmullw xmm3, XMMWORD PTR [rdx] ; x[-2] * H[-2]; Tap 1 1.750 + punpcklbw xmm4, xmm0 ; xx06 xx05 xx04 xx03 xx02 xx01 xx00 xx-1 1.751 + 1.752 + psrldq xmm5, 2 ; xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 00 1.753 + pmullw xmm4, XMMWORD PTR [rdx+16] ; x[-1] * H[-1]; Tap 2 1.754 + 1.755 + punpcklbw xmm5, xmm0 ; xx07 xx06 xx05 xx04 xx03 xx02 xx01 xx00 1.756 + psrldq xmm6, 3 ; xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 01 1.757 + 1.758 + pmullw xmm5, [rdx+32] ; x[ 0] * H[ 0]; Tap 3 1.759 + 1.760 + punpcklbw xmm6, xmm0 ; xx08 xx07 xx06 xx05 xx04 xx03 xx02 xx01 1.761 + psrldq xmm7, 4 ; xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 02 1.762 + 1.763 + pmullw xmm6, [rdx+48] ; x[ 1] * h[ 1] ; Tap 4 1.764 + 1.765 + punpcklbw xmm7, xmm0 ; xx09 xx08 xx07 xx06 xx05 xx04 xx03 xx02 1.766 + psrldq xmm2, 5 ; xx xx xx xx xx 0d 0c 0b 0a 09 08 07 06 05 04 03 1.767 + 1.768 + pmullw xmm7, [rdx+64] ; x[ 2] * h[ 2] ; Tap 5 1.769 + 1.770 + punpcklbw xmm2, xmm0 ; xx0a xx09 xx08 xx07 xx06 xx05 xx04 xx03 1.771 + pmullw xmm2, [rdx+80] ; x[ 3] * h[ 3] ; Tap 6 1.772 + 1.773 + paddsw xmm4, xmm7 1.774 + paddsw xmm4, xmm5 1.775 + 1.776 + paddsw xmm4, xmm3 1.777 + paddsw xmm4, xmm6 1.778 + 1.779 + paddsw xmm4, xmm2 1.780 + paddsw xmm4, [GLOBAL(rd)] 1.781 + 1.782 + psraw xmm4, 7 1.783 + 1.784 + packuswb 
xmm4, xmm0 ; higher 8 bytes 1.785 + 1.786 + movq QWORD Ptr [rdi+8], xmm4 ; store the results in the destination 1.787 + 1.788 + lea rsi, [rsi + rax] 1.789 +%if ABI_IS_32BIT 1.790 + add rdi, DWORD Ptr arg(3) ;dst_pitch 1.791 +%else 1.792 + add rdi, r8 1.793 +%endif 1.794 + 1.795 + dec rcx 1.796 + jnz .filter_block1d16_h6_only_sse2_rowloop ; next row 1.797 + 1.798 + ; begin epilog 1.799 + pop rdi 1.800 + pop rsi 1.801 + RESTORE_GOT 1.802 + RESTORE_XMM 1.803 + UNSHADOW_ARGS 1.804 + pop rbp 1.805 + ret 1.806 + 1.807 + 1.808 +;void vp8_filter_block1d8_v6_only_sse2 1.809 +;( 1.810 +; unsigned char *src_ptr, 1.811 +; unsigned int src_pixels_per_line, 1.812 +; unsigned char *output_ptr, 1.813 +; int dst_pitch, 1.814 +; unsigned int output_height, 1.815 +; const short *vp8_filter 1.816 +;) 1.817 +; Second-pass filter only when xoffset==0. The six taps read rows src_ptr + k * src_pixels_per_line for k = 0..5; this routine applies no two-row back-step to src_ptr. 1.818 +global sym(vp8_filter_block1d8_v6_only_sse2) PRIVATE 1.819 +sym(vp8_filter_block1d8_v6_only_sse2): 1.820 + push rbp 1.821 + mov rbp, rsp 1.822 + SHADOW_ARGS_TO_STACK 6 1.823 + SAVE_XMM 7 1.824 + GET_GOT rbx 1.825 + push rsi 1.826 + push rdi 1.827 + ; end prolog 1.828 + 1.829 + mov rsi, arg(0) ;src_ptr 1.830 + mov rdi, arg(2) ;output_ptr 1.831 + 1.832 + movsxd rcx, dword ptr arg(4) ;output_height 1.833 + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line 1.834 + 1.835 + mov rax, arg(5) ;vp8_filter 1.836 + 1.837 + pxor xmm0, xmm0 ; clear xmm0 1.838 + 1.839 + movdqa xmm7, XMMWORD PTR [GLOBAL(rd)] ; round value, kept in xmm7 for the whole loop 1.840 +%if ABI_IS_32BIT=0 1.841 + movsxd r8, dword ptr arg(3) ; dst_pitch 1.842 +%endif 1.843 + 1.844 +.vp8_filter_block1d8_v6_only_sse2_loop: 1.845 + movq xmm1, MMWORD PTR [rsi] ; line 1 1.846 + movq xmm2, MMWORD PTR [rsi + rdx] ; line 2 1.847 + movq xmm3, MMWORD PTR [rsi + rdx * 2] ; line 3 1.848 + movq xmm5, MMWORD PTR [rsi + rdx * 4] ; line 5 1.849 + add rsi, rdx ; step one row; this is also the per-iteration source advance 1.850 + movq xmm4, MMWORD PTR [rsi + rdx * 2] ; line 4 1.851 + movq xmm6, MMWORD PTR [rsi + rdx * 4] ; line 6 1.852 + 1.853 + punpcklbw xmm1, xmm0 1.854 + pmullw xmm1, [rax] 1.855 + 1.856 + punpcklbw xmm2, xmm0 1.857 + pmullw xmm2, 
[rax + 16] 1.858 + 1.859 + punpcklbw xmm3, xmm0 1.860 + pmullw xmm3, [rax + 32] 1.861 + 1.862 + punpcklbw xmm5, xmm0 1.863 + pmullw xmm5, [rax + 64] 1.864 + 1.865 + punpcklbw xmm4, xmm0 1.866 + pmullw xmm4, [rax + 48] 1.867 + 1.868 + punpcklbw xmm6, xmm0 1.869 + pmullw xmm6, [rax + 80] 1.870 + 1.871 + paddsw xmm2, xmm5 1.872 + paddsw xmm2, xmm3 1.873 + 1.874 + paddsw xmm2, xmm1 1.875 + paddsw xmm2, xmm4 1.876 + 1.877 + paddsw xmm2, xmm6 1.878 + paddsw xmm2, xmm7 1.879 + 1.880 + psraw xmm2, 7 1.881 + packuswb xmm2, xmm0 ; pack and saturate 1.882 + 1.883 + movq QWORD PTR [rdi], xmm2 ; store the results in the destination 1.884 +%if ABI_IS_32BIT 1.885 + add rdi, DWORD PTR arg(3) ;[dst_pitch] 1.886 +%else 1.887 + add rdi, r8 1.888 +%endif 1.889 + dec rcx ; decrement count 1.890 + jnz .vp8_filter_block1d8_v6_only_sse2_loop ; next row 1.891 + 1.892 + ; begin epilog 1.893 + pop rdi 1.894 + pop rsi 1.895 + RESTORE_GOT 1.896 + RESTORE_XMM 1.897 + UNSHADOW_ARGS 1.898 + pop rbp 1.899 + ret 1.900 + 1.901 + 1.902 +;void vp8_unpack_block1d16_h6_sse2 1.903 +;( 1.904 +; unsigned char *src_ptr, 1.905 +; unsigned short *output_ptr, 1.906 +; unsigned int src_pixels_per_line, 1.907 +; unsigned int output_height, 1.908 +; unsigned int output_width 1.909 +;) ; widens 16 source bytes per row to 16 words without filtering; rows are stored with movdqa 1.910 +global sym(vp8_unpack_block1d16_h6_sse2) PRIVATE 1.911 +sym(vp8_unpack_block1d16_h6_sse2): 1.912 + push rbp 1.913 + mov rbp, rsp 1.914 + SHADOW_ARGS_TO_STACK 5 1.915 + GET_GOT rbx 1.916 + push rsi 1.917 + push rdi 1.918 + ; end prolog 1.919 + 1.920 + mov rsi, arg(0) ;src_ptr 1.921 + mov rdi, arg(1) ;output_ptr 1.922 + 1.923 + movsxd rcx, dword ptr arg(3) ;output_height 1.924 + movsxd rax, dword ptr arg(2) ;src_pixels_per_line ; Pitch for Source 1.925 + 1.926 + pxor xmm0, xmm0 ; clear xmm0 for unpack 1.927 +%if ABI_IS_32BIT=0 1.928 + movsxd r8, dword ptr arg(4) ;output_width ; Pitch for Destination 1.929 +%endif 1.930 + 1.931 +.unpack_block1d16_h6_sse2_rowloop: 1.932 + movq xmm1, MMWORD PTR [rsi] ; 0d 0c 0b 0a 09 08 07 06 05 04 03 
02 01 00 -1 -2 1.933 + movq xmm3, MMWORD PTR [rsi+8] ; src[8 .. 15] (a second load, not a copy of xmm1) 1.934 + 1.935 + punpcklbw xmm3, xmm0 ; src[8 .. 15] zero-extended to words 1.936 + punpcklbw xmm1, xmm0 ; src[0 .. 7] zero-extended to words 1.937 + 1.938 + movdqa XMMWORD Ptr [rdi], xmm1 1.939 + movdqa XMMWORD Ptr [rdi + 16], xmm3 1.940 + 1.941 + lea rsi, [rsi + rax] 1.942 +%if ABI_IS_32BIT 1.943 + add rdi, DWORD Ptr arg(4) ;[output_width] 1.944 +%else 1.945 + add rdi, r8 1.946 +%endif 1.947 + dec rcx 1.948 + jnz .unpack_block1d16_h6_sse2_rowloop ; next row 1.949 + 1.950 + ; begin epilog 1.951 + pop rdi 1.952 + pop rsi 1.953 + RESTORE_GOT 1.954 + UNSHADOW_ARGS 1.955 + pop rbp 1.956 + ret 1.957 + 1.958 + 1.959 +;void vp8_bilinear_predict16x16_sse2 1.960 +;( 1.961 +; unsigned char *src_ptr, 1.962 +; int src_pixels_per_line, 1.963 +; int xoffset, 1.964 +; int yoffset, 1.965 +; unsigned char *dst_ptr, 1.966 +; int dst_pitch 1.967 +;) 1.968 +extern sym(vp8_bilinear_filters_x86_8) 1.969 +global sym(vp8_bilinear_predict16x16_sse2) PRIVATE 1.970 +sym(vp8_bilinear_predict16x16_sse2): 1.971 + push rbp 1.972 + mov rbp, rsp 1.973 + SHADOW_ARGS_TO_STACK 6 1.974 + SAVE_XMM 7 1.975 + GET_GOT rbx 1.976 + push rsi 1.977 + push rdi 1.978 + ; end prolog 1.979 + 1.980 + ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset] 1.981 + ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset] 1.982 + 1.983 + lea rcx, [GLOBAL(sym(vp8_bilinear_filters_x86_8))] 1.984 + movsxd rax, dword ptr arg(2) ;xoffset 1.985 + 1.986 + cmp rax, 0 ;skip first_pass filter if xoffset=0 1.987 + je .b16x16_sp_only 1.988 + 1.989 + shl rax, 5 1.990 + add rax, rcx ;HFilter 1.991 + 1.992 + mov rdi, arg(4) ;dst_ptr 1.993 + mov rsi, arg(0) ;src_ptr 1.994 + movsxd rdx, dword ptr arg(5) ;dst_pitch 1.995 + 1.996 + movdqa xmm1, [rax] 1.997 + movdqa xmm2, [rax+16] 1.998 + 1.999 + movsxd rax, dword ptr arg(3) ;yoffset 1.1000 + 1.1001 + cmp rax, 0 ;skip second_pass filter if yoffset=0 1.1002 + je .b16x16_fp_only 1.1003 + 1.1004 + shl rax, 5 1.1005 + add rax, rcx ;VFilter 
1.1006 + 1.1007 + lea rcx, [rdi+rdx*8] 1.1008 + lea rcx, [rcx+rdx*8] 1.1009 + movsxd rdx, dword ptr arg(1) ;src_pixels_per_line 1.1010 + 1.1011 + pxor xmm0, xmm0 1.1012 + 1.1013 +%if ABI_IS_32BIT=0 1.1014 + movsxd r8, dword ptr arg(5) ;dst_pitch 1.1015 +%endif 1.1016 + ; get the first horizontal line done 1.1017 + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 1.1018 + movdqa xmm4, xmm3 ; make a copy of current line 1.1019 + 1.1020 + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 1.1021 + punpckhbw xmm4, xmm0 1.1022 + 1.1023 + pmullw xmm3, xmm1 1.1024 + pmullw xmm4, xmm1 1.1025 + 1.1026 + movdqu xmm5, [rsi+1] 1.1027 + movdqa xmm6, xmm5 1.1028 + 1.1029 + punpcklbw xmm5, xmm0 1.1030 + punpckhbw xmm6, xmm0 1.1031 + 1.1032 + pmullw xmm5, xmm2 1.1033 + pmullw xmm6, xmm2 1.1034 + 1.1035 + paddw xmm3, xmm5 1.1036 + paddw xmm4, xmm6 1.1037 + 1.1038 + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value 1.1039 + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 1.1040 + 1.1041 + paddw xmm4, [GLOBAL(rd)] 1.1042 + psraw xmm4, VP8_FILTER_SHIFT 1.1043 + 1.1044 + movdqa xmm7, xmm3 1.1045 + packuswb xmm7, xmm4 1.1046 + 1.1047 + add rsi, rdx ; next line 1.1048 +.next_row: 1.1049 + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 1.1050 + movdqa xmm4, xmm3 ; make a copy of current line 1.1051 + 1.1052 + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 1.1053 + punpckhbw xmm4, xmm0 1.1054 + 1.1055 + pmullw xmm3, xmm1 1.1056 + pmullw xmm4, xmm1 1.1057 + 1.1058 + movdqu xmm5, [rsi+1] 1.1059 + movdqa xmm6, xmm5 1.1060 + 1.1061 + punpcklbw xmm5, xmm0 1.1062 + punpckhbw xmm6, xmm0 1.1063 + 1.1064 + pmullw xmm5, xmm2 1.1065 + pmullw xmm6, xmm2 1.1066 + 1.1067 + paddw xmm3, xmm5 1.1068 + paddw xmm4, xmm6 1.1069 + 1.1070 + movdqa xmm5, xmm7 1.1071 + movdqa xmm6, xmm7 1.1072 + 1.1073 + punpcklbw xmm5, xmm0 1.1074 + punpckhbw xmm6, xmm0 1.1075 + 1.1076 + pmullw xmm5, [rax] 1.1077 + pmullw xmm6, [rax] 1.1078 + 1.1079 + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value 
1.1080 + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 1.1081 + 1.1082 + paddw xmm4, [GLOBAL(rd)] 1.1083 + psraw xmm4, VP8_FILTER_SHIFT 1.1084 + 1.1085 + movdqa xmm7, xmm3 1.1086 + packuswb xmm7, xmm4 1.1087 + 1.1088 + pmullw xmm3, [rax+16] 1.1089 + pmullw xmm4, [rax+16] 1.1090 + 1.1091 + paddw xmm3, xmm5 1.1092 + paddw xmm4, xmm6 1.1093 + 1.1094 + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value 1.1095 + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 1.1096 + 1.1097 + paddw xmm4, [GLOBAL(rd)] 1.1098 + psraw xmm4, VP8_FILTER_SHIFT 1.1099 + 1.1100 + packuswb xmm3, xmm4 1.1101 + movdqa [rdi], xmm3 ; store the results in the destination 1.1102 + 1.1103 + add rsi, rdx ; next line 1.1104 +%if ABI_IS_32BIT 1.1105 + add rdi, DWORD PTR arg(5) ;dst_pitch 1.1106 +%else 1.1107 + add rdi, r8 1.1108 +%endif 1.1109 + 1.1110 + cmp rdi, rcx 1.1111 + jne .next_row 1.1112 + 1.1113 + jmp .done 1.1114 + 1.1115 +.b16x16_sp_only: 1.1116 + movsxd rax, dword ptr arg(3) ;yoffset 1.1117 + shl rax, 5 1.1118 + add rax, rcx ;VFilter 1.1119 + 1.1120 + mov rdi, arg(4) ;dst_ptr 1.1121 + mov rsi, arg(0) ;src_ptr 1.1122 + movsxd rdx, dword ptr arg(5) ;dst_pitch 1.1123 + 1.1124 + movdqa xmm1, [rax] 1.1125 + movdqa xmm2, [rax+16] 1.1126 + 1.1127 + lea rcx, [rdi+rdx*8] 1.1128 + lea rcx, [rcx+rdx*8] 1.1129 + movsxd rax, dword ptr arg(1) ;src_pixels_per_line 1.1130 + 1.1131 + pxor xmm0, xmm0 1.1132 + 1.1133 + ; get the first horizontal line done 1.1134 + movdqu xmm7, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 1.1135 + 1.1136 + add rsi, rax ; next line 1.1137 +.next_row_spo: 1.1138 + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 1.1139 + 1.1140 + movdqa xmm5, xmm7 1.1141 + movdqa xmm6, xmm7 1.1142 + 1.1143 + movdqa xmm4, xmm3 ; make a copy of current line 1.1144 + movdqa xmm7, xmm3 1.1145 + 1.1146 + punpcklbw xmm5, xmm0 1.1147 + punpckhbw xmm6, xmm0 1.1148 + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 1.1149 + punpckhbw xmm4, xmm0 1.1150 + 1.1151 + pmullw xmm5, xmm1 1.1152 + 
pmullw xmm6, xmm1 1.1153 + pmullw xmm3, xmm2 1.1154 + pmullw xmm4, xmm2 1.1155 + 1.1156 + paddw xmm3, xmm5 1.1157 + paddw xmm4, xmm6 1.1158 + 1.1159 + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value 1.1160 + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 1.1161 + 1.1162 + paddw xmm4, [GLOBAL(rd)] 1.1163 + psraw xmm4, VP8_FILTER_SHIFT 1.1164 + 1.1165 + packuswb xmm3, xmm4 1.1166 + movdqa [rdi], xmm3 ; store the results in the destination 1.1167 + 1.1168 + add rsi, rax ; next line 1.1169 + add rdi, rdx ;dst_pitch 1.1170 + cmp rdi, rcx 1.1171 + jne .next_row_spo 1.1172 + 1.1173 + jmp .done 1.1174 + 1.1175 +.b16x16_fp_only: 1.1176 + lea rcx, [rdi+rdx*8] 1.1177 + lea rcx, [rcx+rdx*8] 1.1178 + movsxd rax, dword ptr arg(1) ;src_pixels_per_line 1.1179 + pxor xmm0, xmm0 1.1180 + 1.1181 +.next_row_fpo: 1.1182 + movdqu xmm3, [rsi] ; xx 00 01 02 03 04 05 06 07 08 09 10 11 12 13 14 1.1183 + movdqa xmm4, xmm3 ; make a copy of current line 1.1184 + 1.1185 + punpcklbw xmm3, xmm0 ; xx 00 01 02 03 04 05 06 1.1186 + punpckhbw xmm4, xmm0 1.1187 + 1.1188 + pmullw xmm3, xmm1 1.1189 + pmullw xmm4, xmm1 1.1190 + 1.1191 + movdqu xmm5, [rsi+1] 1.1192 + movdqa xmm6, xmm5 1.1193 + 1.1194 + punpcklbw xmm5, xmm0 1.1195 + punpckhbw xmm6, xmm0 1.1196 + 1.1197 + pmullw xmm5, xmm2 1.1198 + pmullw xmm6, xmm2 1.1199 + 1.1200 + paddw xmm3, xmm5 1.1201 + paddw xmm4, xmm6 1.1202 + 1.1203 + paddw xmm3, [GLOBAL(rd)] ; xmm3 += round value 1.1204 + psraw xmm3, VP8_FILTER_SHIFT ; xmm3 /= 128 1.1205 + 1.1206 + paddw xmm4, [GLOBAL(rd)] 1.1207 + psraw xmm4, VP8_FILTER_SHIFT 1.1208 + 1.1209 + packuswb xmm3, xmm4 1.1210 + movdqa [rdi], xmm3 ; store the results in the destination 1.1211 + 1.1212 + add rsi, rax ; next line 1.1213 + add rdi, rdx ; dst_pitch 1.1214 + cmp rdi, rcx 1.1215 + jne .next_row_fpo 1.1216 + 1.1217 +.done: 1.1218 + ; begin epilog 1.1219 + pop rdi 1.1220 + pop rsi 1.1221 + RESTORE_GOT 1.1222 + RESTORE_XMM 1.1223 + UNSHADOW_ARGS 1.1224 + pop rbp 1.1225 + ret 1.1226 + 1.1227 + 1.1228 +;void 
;   vp8_bilinear_predict8x8_sse2
;(
;    unsigned char  *src_ptr,
;    int   src_pixels_per_line,
;    int  xoffset,
;    int  yoffset,
;    unsigned char *dst_ptr,
;    int dst_pitch
;)
;
; Two-tap (bilinear) prediction of an 8-pixel-wide, 8-row block.
;
; Nine source rows are first copied (16 bytes each, unaligned loads) into
; a 144-byte aligned scratch area on the stack; only bytes 0..8 of each
; row take part in the arithmetic.  Both passes are always executed:
; each row is filtered horizontally, then blended with the previous
; filtered row using the vertical taps.  Each stage computes
; (a * tap0 + b * tap1 + rd) >> VP8_FILTER_SHIFT per 16-bit lane.
;
; rsp itself is used as the read cursor over the scratch rows: it is
; advanced by 16 once per row consumed, 9 * 16 = 144 bytes in total, so
; that when the loop ends it is back where it was before "sub rsp, 144".
global sym(vp8_bilinear_predict8x8_sse2) PRIVATE
sym(vp8_bilinear_predict8x8_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 144                                    ; scratch: 9 rows * 16 bytes

    ;const short *HFilter = vp8_bilinear_filters_x86_8[xoffset]
    ;const short *VFilter = vp8_bilinear_filters_x86_8[yoffset]
        lea         rcx,        [GLOBAL(sym(vp8_bilinear_filters_x86_8))]

        mov         rsi,        arg(0)                      ;src_ptr
        movsxd      rdx,        dword ptr arg(1)            ;src_pixels_per_line

        ; Stage the nine source rows on the stack, three rows at a time.
        lea         rax,        [rdx + rdx*2]               ;rax = 3 * source pitch

        movdqu      xmm0,       [rsi]                       ;rows 0..2
        movdqu      xmm1,       [rsi+rdx]
        movdqu      xmm2,       [rsi+rdx*2]
        add         rsi,        rax
        movdqa      XMMWORD PTR [rsp],      xmm0
        movdqa      XMMWORD PTR [rsp+16],   xmm1
        movdqa      XMMWORD PTR [rsp+32],   xmm2

        movdqu      xmm3,       [rsi]                       ;rows 3..5
        movdqu      xmm4,       [rsi+rdx]
        movdqu      xmm5,       [rsi+rdx*2]
        add         rsi,        rax
        movdqa      XMMWORD PTR [rsp+48],   xmm3
        movdqa      XMMWORD PTR [rsp+64],   xmm4
        movdqa      XMMWORD PTR [rsp+80],   xmm5

        movdqu      xmm6,       [rsi]                       ;rows 6..8
        movdqu      xmm7,       [rsi+rdx]
        movdqu      xmm0,       [rsi+rdx*2]
        movdqa      XMMWORD PTR [rsp+96],   xmm6
        movdqa      XMMWORD PTR [rsp+112],  xmm7
        movdqa      XMMWORD PTR [rsp+128],  xmm0

        mov         rdi,        arg(4)                      ;dst_ptr
        movsxd      rdx,        dword ptr arg(5)            ;dst_pitch

        ; Table entries are 32 bytes apart: tap 0 at +0, tap 1 at +16.
        movsxd      rax,        dword ptr arg(2)            ;xoffset
        shl         rax,        5
        movdqa      xmm1,       [rcx+rax]                   ;horizontal tap 0
        movdqa      xmm2,       [rcx+rax+16]                ;horizontal tap 1

        movsxd      rax,        dword ptr arg(3)            ;yoffset
        shl         rax,        5
        movdqa      xmm5,       [rcx+rax]                   ;vertical tap 0
        movdqa      xmm6,       [rcx+rax+16]                ;vertical tap 1

        lea         rcx,        [rdi+rdx*8]                 ;rcx = dst_ptr + 8 * dst_pitch (loop end)

        pxor        xmm0,       xmm0                        ;zero, for byte -> word unpacking

        ; Horizontal pass over staged row 0; the word result stays in
        ; xmm7 as the "previous row" of the vertical blend.
        movdqa      xmm3,       XMMWORD PTR [rsp]
        movdqa      xmm4,       xmm3
        psrldq      xmm4,       1                           ;same row shifted by one pixel

        punpcklbw   xmm3,       xmm0                        ;pixels 0..7 as words
        punpcklbw   xmm4,       xmm0                        ;pixels 1..8 as words

        pmullw      xmm3,       xmm1
        pmullw      xmm4,       xmm2

        paddw       xmm3,       xmm4

        paddw       xmm3,       [GLOBAL(rd)]                ;add rounding constant
        psraw       xmm3,       VP8_FILTER_SHIFT

        movdqa      xmm7,       xmm3
        add         rsp,        16                          ;advance to staged row 1
.row_loop8x8:
        pmullw      xmm7,       xmm5                        ;previous row * vertical tap 0

        ; Horizontal pass over the current staged row.
        movdqa      xmm3,       XMMWORD PTR [rsp]
        movdqa      xmm4,       xmm3
        psrldq      xmm4,       1

        punpcklbw   xmm3,       xmm0                        ;pixels 0..7 as words
        punpcklbw   xmm4,       xmm0                        ;pixels 1..8 as words

        pmullw      xmm3,       xmm1
        pmullw      xmm4,       xmm2

        paddw       xmm3,       xmm4

        paddw       xmm3,       [GLOBAL(rd)]
        psraw       xmm3,       VP8_FILTER_SHIFT            ;xmm3 = current filtered row

        ; Vertical blend of previous and current filtered rows.
        movdqa      xmm4,       xmm3
        pmullw      xmm4,       xmm6                        ;current row * vertical tap 1
        paddw       xmm4,       xmm7

        movdqa      xmm7,       xmm3                        ;current row becomes previous row

        paddw       xmm4,       [GLOBAL(rd)]
        psraw       xmm4,       VP8_FILTER_SHIFT

        packuswb    xmm4,       xmm0                        ;words -> bytes with unsigned saturation
        movq        [rdi],      xmm4                        ;store 8 output pixels

        add         rsp,        16                          ;next staged row
        add         rdi,        rdx                         ;next destination row

        cmp         rdi,        rcx
        jne         .row_loop8x8

    ; The loop has already moved rsp up by the full 144 bytes, so no
    ; "add rsp, 144" is needed before undoing ALIGN_STACK.
    ; NOTE(review): presumably ALIGN_STACK left the caller's rsp in the
    ; slot rsp now points at -- confirm in x86_abi_support.asm.
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
; Rounding constant added before the >> VP8_FILTER_SHIFT in every filter
; stage: eight 16-bit lanes of 0x40.
rd:
    times 8 dw 0x40