;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; SSE2 8-tap sub-pixel filters (NASM/yasm Intel syntax).
;
; Every exported routine in this file has the same C-level shape:
;
;   void vp9_filter_block1d<W>_<dir>8[_avg]_sse2
;   (
;       unsigned char *src_ptr,       arg(0)
;       unsigned int   src_pitch,     arg(1)  bytes between source rows
;       unsigned char *output_ptr,    arg(2)
;       unsigned int   out_pitch,     arg(3)  bytes between output rows
;       unsigned int   output_height, arg(4)  rows to produce; the loops are
;                                             dec/jnz, so this must be >= 1
;       short         *filter         arg(5)  eight 16-bit taps k0..k7; read
;                                             with movdqa, so it must be
;                                             16-byte aligned
;   )
;
;   W   = 4, 8 or 16 output pixels per row
;   dir = v  (taps run down a column) or h (taps run along a row)
;   avg = result is averaged (pavgb) with the bytes already in the output
;
; Per output pixel:
;   sum = saturating 16-bit sum of pixel[i] * k[i], i = 0..7
;   out = unsigned-saturate-to-byte( (sum + 64) >> 7 )
;
; Source addressing, as implemented below:
;   vertical:   tap i reads row i counted from src_ptr, so src_ptr is the
;               row that pairs with k0.
;   horizontal: each row is fetched with one unaligned 16-byte load at
;               src_ptr - 3 (plus a second one at src_ptr + 5 for W = 16),
;               so all 16 bytes of each load must be readable.
;
; Arguments are fetched through arg(n) and the prolog/epilog helpers come
; from x86_abi_support.asm, which hides the differences between the
; supported ABIs.
;
;Note: tap3 and tap4 have to be applied and added after the other taps to
;avoid overflow.
;-----------------------------------------------------------------------------

; Rounding term for the final ">> 7": two packed 16-bit words of 64 (0x0040).
; STORE_ROUND_AND_ZERO broadcasts it to all eight word lanes.
%define ROUND_PAIR_64   0x0400040

;-----------------------------------------------------------------------------
; Stack scratch layouts. The slots are addressed relative to rsp after the
; prolog has aligned the stack to 16 bytes and reserved the space.
;-----------------------------------------------------------------------------

; Layout for the 4-pixel-wide routines: 6 slots, taps stored in pairs.
%macro DEFINE_SLOTS_4 0
    %define k0k1 [rsp + 16 * 0]
    %define k2k3 [rsp + 16 * 1]
    %define k5k4 [rsp + 16 * 2]
    %define k6k7 [rsp + 16 * 3]
    %define krd  [rsp + 16 * 4]
    %define zero [rsp + 16 * 5]
%endm

; Layout for the 8/16-pixel-wide routines: 10 slots, one per tap.
%macro DEFINE_SLOTS_8 0
    %define k0   [rsp + 16 * 0]
    %define k1   [rsp + 16 * 1]
    %define k2   [rsp + 16 * 2]
    %define k3   [rsp + 16 * 3]
    %define k4   [rsp + 16 * 4]
    %define k5   [rsp + 16 * 5]
    %define k6   [rsp + 16 * 6]
    %define k7   [rsp + 16 * 7]
    %define krd  [rsp + 16 * 8]
    %define zero [rsp + 16 * 9]
%endm

;-----------------------------------------------------------------------------
; Shared prolog / epilog.
;   param 1: number of 16-byte scratch slots to reserve
;   param 2: 1 if the routine uses rbx (saved and restored), 0 otherwise
; ALIGN_STACK uses rax as its scratch register, so rax is not live here.
;-----------------------------------------------------------------------------
%macro FILTER_PROLOG 2
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 6
    SAVE_XMM 7
    push        rsi
    push        rdi
%if %2
    push        rbx
%endif
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16 * %1
%endm

%macro FILTER_EPILOG 2
    add         rsp, 16 * %1
    pop         rsp                         ;undo ALIGN_STACK
%if %2
    pop         rbx
%endif
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%endm

;-----------------------------------------------------------------------------
; Tail shared by both GET_FILTERS variants.
; In:    rcx = ROUND_PAIR_64
; Out:   krd  = eight words of 64, zero = all-zero vector
; Clobb: xmm6, xmm7
;-----------------------------------------------------------------------------
%macro STORE_ROUND_AND_ZERO 0
    movq        xmm6, rcx
    pshufd      xmm6, xmm6, 0
    movdqa      krd, xmm6                   ;rounding

    pxor        xmm7, xmm7
    movdqa      zero, xmm7
%endm

;-----------------------------------------------------------------------------
; GET_FILTERS_4: load the eight taps and store them in pairs, four copies of
; one tap in the low half and four copies of its partner in the high half.
; This matches the two-rows-per-register packing used by APPLY_FILTER_4.
; Requires the DEFINE_SLOTS_4 layout.
; Clobb: rdx, rcx, xmm0-xmm7
;-----------------------------------------------------------------------------
%macro GET_FILTERS_4 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rcx, ROUND_PAIR_64

    movdqa      xmm7, [rdx]                 ;load filters
    pshuflw     xmm0, xmm7, 0b              ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    psrldq      xmm7, 8                     ;bring k4..k7 into the low half
    pshuflw     xmm4, xmm7, 0b              ;k4
    pshuflw     xmm5, xmm7, 01010101b       ;k5
    pshuflw     xmm6, xmm7, 10101010b       ;k6
    pshuflw     xmm7, xmm7, 11111111b       ;k7

    punpcklqdq  xmm0, xmm1
    punpcklqdq  xmm2, xmm3
    punpcklqdq  xmm5, xmm4
    punpcklqdq  xmm6, xmm7

    movdqa      k0k1, xmm0
    movdqa      k2k3, xmm2
    movdqa      k5k4, xmm5
    movdqa      k6k7, xmm6

    STORE_ROUND_AND_ZERO
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_4: filter four pixels and store them at [rdi].
;   param 1: 1 to average the result with the four bytes already at [rdi]
; In:    xmm0..xmm7 hold the four source bytes for taps 0..7 in their low
;        dwords (xmmN pairs with tap N)
; Clobb: xmm0-xmm2, xmm5, xmm6
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_4 1
    punpckldq   xmm0, xmm1                  ;two row in one register
    punpckldq   xmm6, xmm7
    punpckldq   xmm2, xmm3
    punpckldq   xmm5, xmm4

    punpcklbw   xmm0, zero                  ;unpack to word
    punpcklbw   xmm6, zero
    punpcklbw   xmm2, zero
    punpcklbw   xmm5, zero

    pmullw      xmm0, k0k1                  ;multiply the filter factors
    pmullw      xmm6, k6k7
    pmullw      xmm2, k2k3
    pmullw      xmm5, k5k4

    ; Accumulation order is taps 0,6,1,7,2,5 and then 3,4 last (see the
    ; overflow note at the top of the file).
    paddsw      xmm0, xmm6                  ;sum
    movdqa      xmm1, xmm0
    psrldq      xmm1, 8
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm2
    psrldq      xmm2, 8
    paddsw      xmm0, xmm5
    psrldq      xmm5, 8
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm5

    paddsw      xmm0, krd                   ;rounding
    psraw       xmm0, 7                     ;shift
    packuswb    xmm0, xmm0                  ;pack to byte

%if %1
    movd        xmm1, [rdi]
    pavgb       xmm0, xmm1
%endif
    movd        [rdi], xmm0
%endm

;-----------------------------------------------------------------------------
; GET_FILTERS: load the eight taps, broadcast each one to a full vector of
; eight words and store it in its own slot. Also loads the src and output
; pointers. Requires the DEFINE_SLOTS_8 layout.
; Out:   rsi = src_ptr, rdi = output_ptr
; Clobb: rdx, rcx, xmm0-xmm7
;-----------------------------------------------------------------------------
%macro GET_FILTERS 0
    mov         rdx, arg(5)                 ;filter ptr
    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr
    mov         rcx, ROUND_PAIR_64

    movdqa      xmm7, [rdx]                 ;load filters
    pshuflw     xmm0, xmm7, 0b              ;k0
    pshuflw     xmm1, xmm7, 01010101b       ;k1
    pshuflw     xmm2, xmm7, 10101010b       ;k2
    pshuflw     xmm3, xmm7, 11111111b       ;k3
    pshufhw     xmm4, xmm7, 0b              ;k4
    pshufhw     xmm5, xmm7, 01010101b       ;k5
    pshufhw     xmm6, xmm7, 10101010b       ;k6
    pshufhw     xmm7, xmm7, 11111111b       ;k7

    punpcklwd   xmm0, xmm0
    punpcklwd   xmm1, xmm1
    punpcklwd   xmm2, xmm2
    punpcklwd   xmm3, xmm3
    punpckhwd   xmm4, xmm4
    punpckhwd   xmm5, xmm5
    punpckhwd   xmm6, xmm6
    punpckhwd   xmm7, xmm7

    movdqa      k0,   xmm0                  ;store filter factors on stack
    movdqa      k1,   xmm1
    movdqa      k2,   xmm2
    movdqa      k3,   xmm3
    movdqa      k4,   xmm4
    movdqa      k5,   xmm5
    movdqa      k6,   xmm6
    movdqa      k7,   xmm7

    STORE_ROUND_AND_ZERO
%endm

;-----------------------------------------------------------------------------
; LOAD_VERT_4: fetch four bytes from each of eight consecutive rows.
; In:    rsi = row 0, rax = src pitch, rdx = 3 * rax
; Out:   xmmN = row N (N = 0..7); rsi advanced by one row
;-----------------------------------------------------------------------------
%macro LOAD_VERT_4 0
    movd        xmm0, [rsi]                 ;load src: row 0
    movd        xmm1, [rsi + rax]           ;1
    movd        xmm6, [rsi + rdx * 2]       ;6
    lea         rsi,  [rsi + rax]
    movd        xmm7, [rsi + rdx * 2]       ;7
    movd        xmm2, [rsi + rax]           ;2
    movd        xmm3, [rsi + rax * 2]       ;3
    movd        xmm4, [rsi + rdx]           ;4
    movd        xmm5, [rsi + rax * 4]       ;5
%endm

;-----------------------------------------------------------------------------
; LOAD_VERT_8: fetch eight bytes from each of eight consecutive rows.
;   param 1: byte offset within the row
; In:    rsi = row 0, rax = src pitch, rdx = 3 * rax
; Out:   xmmN = row N (N = 0..7); rsi advanced by one row
;-----------------------------------------------------------------------------
%macro LOAD_VERT_8 1
    movq        xmm0, [rsi + %1]            ;0
    movq        xmm1, [rsi + rax + %1]      ;1
    movq        xmm6, [rsi + rdx * 2 + %1]  ;6
    lea         rsi,  [rsi + rax]
    movq        xmm7, [rsi + rdx * 2 + %1]  ;7
    movq        xmm2, [rsi + rax + %1]      ;2
    movq        xmm3, [rsi + rax * 2 + %1]  ;3
    movq        xmm4, [rsi + rdx + %1]      ;4
    movq        xmm5, [rsi + rax * 4 + %1]  ;5
%endm

;-----------------------------------------------------------------------------
; SPLIT_TAPS_H: derive the per-tap inputs of a horizontal filter from a
; single 16-byte load.
; In:    xmm0 = 16 source bytes, byte 0 pairing with tap 0
; Out:   xmmN = xmm0 shifted right by N bytes (N = 1..7)
; The copies and shifts are independent of each other, so their relative
; order does not affect the result.
;-----------------------------------------------------------------------------
%macro SPLIT_TAPS_H 0
    movdqa      xmm1, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm7, xmm0
    movdqa      xmm2, xmm0
    movdqa      xmm5, xmm0
    movdqa      xmm3, xmm0
    movdqa      xmm4, xmm0

    psrldq      xmm1, 1
    psrldq      xmm6, 6
    psrldq      xmm7, 7
    psrldq      xmm2, 2
    psrldq      xmm5, 5
    psrldq      xmm3, 3
    psrldq      xmm4, 4
%endm

;-----------------------------------------------------------------------------
; APPLY_FILTER_8: filter eight pixels and store them at [rdi + offset].
;   param 1: 1 to average the result with the eight bytes already there
;   param 2: byte offset into the output row
; In:    xmm0..xmm7 hold the eight source bytes for taps 0..7 in their low
;        quadwords (xmmN pairs with tap N)
; Clobb: xmm0-xmm7
;-----------------------------------------------------------------------------
%macro APPLY_FILTER_8 2
    punpcklbw   xmm0, zero
    punpcklbw   xmm1, zero
    punpcklbw   xmm6, zero
    punpcklbw   xmm7, zero
    punpcklbw   xmm2, zero
    punpcklbw   xmm5, zero
    punpcklbw   xmm3, zero
    punpcklbw   xmm4, zero

    pmullw      xmm0, k0
    pmullw      xmm1, k1
    pmullw      xmm6, k6
    pmullw      xmm7, k7
    pmullw      xmm2, k2
    pmullw      xmm5, k5
    pmullw      xmm3, k3
    pmullw      xmm4, k4

    ; taps 3 and 4 are added last (see the overflow note at the top)
    paddsw      xmm0, xmm1
    paddsw      xmm0, xmm6
    paddsw      xmm0, xmm7
    paddsw      xmm0, xmm2
    paddsw      xmm0, xmm5
    paddsw      xmm0, xmm3
    paddsw      xmm0, xmm4

    paddsw      xmm0, krd                   ;rounding
    psraw       xmm0, 7                     ;shift
    packuswb    xmm0, xmm0                  ;pack back to byte
%if %1
    movq        xmm1, [rdi + %2]
    pavgb       xmm0, xmm1
%endif
    movq        [rdi + %2], xmm0
%endm

;-----------------------------------------------------------------------------
; Function generators.
;
; Register roles inside the loops:
;   rsi = current source row          rdi = current output row
;   rax = src pitch                   rcx = rows remaining
;   vertical:   rbx = out pitch,      rdx = 3 * src pitch
;   horizontal: rdx = out pitch
;-----------------------------------------------------------------------------

; Vertical filter, 4 pixels wide.
;   param 1: function name    param 2: 1 for the averaging variant
%macro VERT_FILTER_4 2
global sym(%1) PRIVATE
sym(%1):
    FILTER_PROLOG 6, 1
    DEFINE_SLOTS_4

    GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    LOAD_VERT_4                             ;also steps rsi to the next row
    APPLY_FILTER_4 %2

    lea         rdi, [rdi + rbx]
    dec         rcx
    jnz         .loop

    FILTER_EPILOG 6, 1
%endm

; Vertical filter, 8 or 16 pixels wide.
;   param 1: function name    param 2: 1 for the averaging variant
;   param 3: width in pixels (8 or 16)
%macro VERT_FILTER_8N 3
global sym(%1) PRIVATE
sym(%1):
    FILTER_PROLOG 10, 1
    DEFINE_SLOTS_8

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rbx, DWORD PTR arg(3)       ;out_pitch
    lea         rdx, [rax + rax * 2]
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    LOAD_VERT_8 0
    APPLY_FILTER_8 %2, 0
%if %3 == 16
    sub         rsi, rax                    ;undo the row step of the first load

    LOAD_VERT_8 8
    APPLY_FILTER_8 %2, 8
    add         rdi, rbx
%else

    lea         rdi, [rdi + rbx]
%endif

    dec         rcx
    jnz         .loop

    FILTER_EPILOG 10, 1
%endm

; Horizontal filter, 4 pixels wide.
;   param 1: function name    param 2: 1 for the averaging variant
%macro HORIZ_FILTER_4 2
global sym(%1) PRIVATE
sym(%1):
    FILTER_PROLOG 6, 0
    DEFINE_SLOTS_4

    GET_FILTERS_4

    mov         rsi, arg(0)                 ;src_ptr
    mov         rdi, arg(2)                 ;output_ptr

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 3]             ;load src
    SPLIT_TAPS_H

    APPLY_FILTER_4 %2

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    FILTER_EPILOG 6, 0
%endm

; Horizontal filter, 8 or 16 pixels wide.
;   param 1: function name    param 2: 1 for the averaging variant
;   param 3: width in pixels (8 or 16)
%macro HORIZ_FILTER_8N 3
global sym(%1) PRIVATE
sym(%1):
    FILTER_PROLOG 10, 0
    DEFINE_SLOTS_8

    GET_FILTERS

    movsxd      rax, DWORD PTR arg(1)       ;pixels_per_line
    movsxd      rdx, DWORD PTR arg(3)       ;out_pitch
    movsxd      rcx, DWORD PTR arg(4)       ;output_height

.loop:
    movdqu      xmm0, [rsi - 3]             ;load src
    SPLIT_TAPS_H

    APPLY_FILTER_8 %2, 0
%if %3 == 16

    movdqu      xmm0, [rsi + 5]             ;load src, right-hand 8 pixels
    SPLIT_TAPS_H

    APPLY_FILTER_8 %2, 8
%endif

    lea         rsi, [rsi + rax]
    lea         rdi, [rdi + rdx]
    dec         rcx
    jnz         .loop

    FILTER_EPILOG 10, 0
%endm

;-----------------------------------------------------------------------------
; Exported routines (same order as before; prototype at the top of the file).
;-----------------------------------------------------------------------------

;void vp9_filter_block1d{4,8,16}_v8_sse2(src_ptr, src_pitch, output_ptr,
;                                        out_pitch, output_height, filter)
VERT_FILTER_4   vp9_filter_block1d4_v8_sse2,      0
VERT_FILTER_8N  vp9_filter_block1d8_v8_sse2,      0, 8
VERT_FILTER_8N  vp9_filter_block1d16_v8_sse2,     0, 16

;Same, with the result averaged into the existing output.
VERT_FILTER_4   vp9_filter_block1d4_v8_avg_sse2,  1
VERT_FILTER_8N  vp9_filter_block1d8_v8_avg_sse2,  1, 8
VERT_FILTER_8N  vp9_filter_block1d16_v8_avg_sse2, 1, 16

;void vp9_filter_block1d{4,8,16}_h8_sse2(src_ptr, src_pixels_per_line,
;                                        output_ptr, output_pitch,
;                                        output_height, filter)
HORIZ_FILTER_4  vp9_filter_block1d4_h8_sse2,      0
HORIZ_FILTER_8N vp9_filter_block1d8_h8_sse2,      0, 8
HORIZ_FILTER_8N vp9_filter_block1d16_h8_sse2,     0, 16

;Same, with the result averaged into the existing output.
HORIZ_FILTER_4  vp9_filter_block1d4_h8_avg_sse2,  1
HORIZ_FILTER_8N vp9_filter_block1d8_h8_avg_sse2,  1, 8
HORIZ_FILTER_8N vp9_filter_block1d16_h8_avg_sse2, 1, 16