1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libvpx/vp8/common/x86/variance_impl_ssse3.asm Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,364 @@ 1.4 +; 1.5 +; Copyright (c) 2010 The WebM project authors. All Rights Reserved. 1.6 +; 1.7 +; Use of this source code is governed by a BSD-style license 1.8 +; that can be found in the LICENSE file in the root of the source 1.9 +; tree. An additional intellectual property rights grant can be found 1.10 +; in the file PATENTS. All contributing project authors may 1.11 +; be found in the AUTHORS file in the root of the source tree. 1.12 +; 1.13 + 1.14 + 1.15 +%include "vpx_ports/x86_abi_support.asm" 1.16 + 1.17 +%define xmm_filter_shift 7 1.18 + 1.19 + 1.20 +;void vp8_filter_block2d_bil_var_ssse3 1.21 +;( 1.22 +; unsigned char *ref_ptr, 1.23 +; int ref_pixels_per_line, 1.24 +; unsigned char *src_ptr, 1.25 +; int src_pixels_per_line, 1.26 +; unsigned int Height, 1.27 +; int xoffset, 1.28 +; int yoffset, 1.29 +; int *sum, 1.30 +; unsigned int *sumsquared 1.31 +; 1.32 +;) 1.33 +;Note: The filter coefficient at offset=0 is 128. Since the second register 1.34 +;for Pmaddubsw is signed bytes, we must calculate zero offset separately. 
;-----------------------------------------------------------------------
; vp8_filter_block2d_bil_var_ssse3
;
; For Height rows of 16 pixels, accumulates
;     diff = bilinear_filtered(ref) - src
; and stores sum(diff) to *arg(7) and sum(diff*diff) to *arg(8).
;
; Path selection (xoffset = arg(5), yoffset = arg(6)):
;   xoffset != 0, yoffset != 0 : horizontal pass, then vertical pass
;   xoffset == 0, yoffset != 0 : vertical pass only     (..._sp_only)
;   xoffset != 0, yoffset == 0 : horizontal pass only   (..._fp_only)
;   xoffset == 0, yoffset == 0 : no filtering           (..._full_pixel)
; Zero offsets are branched away before the coefficient table is indexed,
; so table entry 0 (128, 0) is never fed to pmaddubsw.
;
; Register roles (all paths):
;   rsi  = current ref row            rdi  = current src row
;   rcx  = rows remaining (table base until Height is loaded)
;   rax  = horizontal filter entry, or ref stride in the paths with no
;          horizontal pass
;   rdx  = vertical filter entry, or a stride in the paths with no
;          vertical pass
;   r8/r9 = ref/src stride (64-bit builds only; 32-bit re-reads arg())
;   xmm6 = 8 x int16 running sum of diff
;   xmm7 = 4 x int32 running sum of diff*diff
;
; Memory read per ref row: 16 bytes, plus 1 more byte when a horizontal
; pass runs ([rsi+1]); the vertical pass reads Height+1 ref rows.
;
; Limits visible from the code:
;   * xoffset/yoffset are used unchecked as an index into an 8-entry
;     table (entry size 16 bytes) - presumably callers pass 0..7.
;   * Each int16 lane of xmm6 receives two diffs per row, each within
;     [-255, 255], so the lane cannot overflow while Height <= 64.
;   * Height == 0 stores sum = 0 and sumsquared = 0.
;
; sym(), arg(), GLOBAL(), SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT and
; their RESTORE_/UNSHADOW_ counterparts come from the included
; x86_abi_support.asm; their expansions are not visible in this file.
;-----------------------------------------------------------------------
global sym(vp8_filter_block2d_bil_var_ssse3) PRIVATE
sym(vp8_filter_block2d_bil_var_ssse3):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 9
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    pxor        xmm6, xmm6                      ; clear sum accumulator
    pxor        xmm7, xmm7                      ; clear sum-of-squares accumulator

    lea         rcx, [GLOBAL(vp8_bilinear_filters_ssse3)]
    movsxd      rax, dword ptr arg(5)           ; xoffset

    test        rax, rax                        ; skip first_pass filter if xoffset=0
    jz          .filter_block2d_bil_var_ssse3_sp_only

    shl         rax, 4                          ; point to filter coeff with xoffset
    lea         rax, [rax + rcx]                ; HFilter

    movsxd      rdx, dword ptr arg(6)           ; yoffset

    test        rdx, rdx                        ; skip second_pass filter if yoffset=0
    jz          .filter_block2d_bil_var_ssse3_fp_only

    shl         rdx, 4
    lea         rdx, [rdx + rcx]                ; VFilter

    mov         rsi, arg(0)                     ;ref_ptr
    mov         rdi, arg(2)                     ;src_ptr
    movsxd      rcx, dword ptr arg(4)           ;Height

    test        rcx, rcx                        ; nothing to accumulate for Height=0
    jz          .filter_block2d_bil_variance

    ; Horizontally filter row 0 so the loop always has a "previous row"
    ; in xmm0 for the vertical pass.
    movdqu      xmm0, XMMWORD PTR [rsi]
    movdqu      xmm1, XMMWORD PTR [rsi+1]
    movdqa      xmm2, xmm0

    punpcklbw   xmm0, xmm1                      ; interleave pixel[i], pixel[i+1] (low 8)
    punpckhbw   xmm2, xmm1                      ; same for the high 8 pixels
    pmaddubsw   xmm0, [rax]
    pmaddubsw   xmm2, [rax]

    paddw       xmm0, [GLOBAL(xmm_bi_rd)]       ; round
    paddw       xmm2, [GLOBAL(xmm_bi_rd)]
    psraw       xmm0, xmm_filter_shift
    psraw       xmm2, xmm_filter_shift

    packuswb    xmm0, xmm2                      ; xmm0 = 16 filtered bytes of row 0

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)           ;ref_pixels_per_line
%else
    movsxd      r8, dword ptr arg(1)            ;ref_pixels_per_line
    movsxd      r9, dword ptr arg(3)            ;src_pixels_per_line
    lea         rsi, [rsi + r8]
%endif

.filter_block2d_bil_var_ssse3_loop:
    ; first pass: horizontal filter of the current ref row -> xmm1 (bytes)
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rsi+1]
    movdqa      xmm3, xmm1

    punpcklbw   xmm1, xmm2
    punpckhbw   xmm3, xmm2
    pmaddubsw   xmm1, [rax]
    pmaddubsw   xmm3, [rax]

    paddw       xmm1, [GLOBAL(xmm_bi_rd)]
    paddw       xmm3, [GLOBAL(xmm_bi_rd)]
    psraw       xmm1, xmm_filter_shift
    psraw       xmm3, xmm_filter_shift
    packuswb    xmm1, xmm3

    ; second pass: vertical filter between previous (xmm0) and current row
    movdqa      xmm2, xmm0
    movdqa      xmm0, xmm1                      ; current row becomes next "previous"
    movdqa      xmm3, xmm2

    punpcklbw   xmm2, xmm1
    punpckhbw   xmm3, xmm1
    pmaddubsw   xmm2, [rdx]
    pmaddubsw   xmm3, [rdx]

    paddw       xmm2, [GLOBAL(xmm_bi_rd)]
    paddw       xmm3, [GLOBAL(xmm_bi_rd)]
    psraw       xmm2, xmm_filter_shift
    psraw       xmm3, xmm_filter_shift

    ; widen 16 src bytes to words
    movq        xmm1, QWORD PTR [rdi]
    pxor        xmm4, xmm4
    punpcklbw   xmm1, xmm4
    movq        xmm5, QWORD PTR [rdi+8]
    punpcklbw   xmm5, xmm4

    psubw       xmm2, xmm1                      ; diff = filtered ref - src
    psubw       xmm3, xmm5
    paddw       xmm6, xmm2
    paddw       xmm6, xmm3
    pmaddwd     xmm2, xmm2                      ; diff*diff, adjacent pairs summed
    pmaddwd     xmm3, xmm3
    paddd       xmm7, xmm2
    paddd       xmm7, xmm3

%if ABI_IS_32BIT
    add         rsi, dword ptr arg(1)           ;ref_pixels_per_line
    add         rdi, dword ptr arg(3)           ;src_pixels_per_line
%else
    lea         rsi, [rsi + r8]
    lea         rdi, [rdi + r9]
%endif

    sub         rcx, 1
    jnz         .filter_block2d_bil_var_ssse3_loop

    jmp         .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_sp_only:
    ; xoffset == 0: vertical filter only. rcx still holds the table base.
    movsxd      rdx, dword ptr arg(6)           ; yoffset

    test        rdx, rdx                        ; Both xoffset =0 and yoffset=0
    jz          .filter_block2d_bil_var_ssse3_full_pixel

    shl         rdx, 4
    lea         rdx, [rdx + rcx]                ; VFilter

    mov         rsi, arg(0)                     ;ref_ptr
    mov         rdi, arg(2)                     ;src_ptr
    movsxd      rcx, dword ptr arg(4)           ;Height

    test        rcx, rcx                        ; nothing to accumulate for Height=0
    jz          .filter_block2d_bil_variance

    movsxd      rax, dword ptr arg(1)           ;ref_pixels_per_line

    movdqu      xmm1, XMMWORD PTR [rsi]         ; xmm1 = row 0 ("previous row")

%if ABI_IS_32BIT=0
    movsxd      r9, dword ptr arg(3)            ;src_pixels_per_line
%endif

    lea         rsi, [rsi + rax]

.filter_block2d_bil_sp_only_loop:
    movdqu      xmm3, XMMWORD PTR [rsi]         ; current row
    movdqa      xmm2, xmm1
    movdqa      xmm0, xmm3                      ; keep current row; xmm3 is reused below

    punpcklbw   xmm1, xmm3                      ; interleave previous/current (low 8)
    punpckhbw   xmm2, xmm3                      ; interleave previous/current (high 8)
    pmaddubsw   xmm1, [rdx]
    pmaddubsw   xmm2, [rdx]

    paddw       xmm1, [GLOBAL(xmm_bi_rd)]
    paddw       xmm2, [GLOBAL(xmm_bi_rd)]
    psraw       xmm1, xmm_filter_shift
    psraw       xmm2, xmm_filter_shift

    movq        xmm3, QWORD PTR [rdi]
    pxor        xmm4, xmm4
    punpcklbw   xmm3, xmm4
    movq        xmm5, QWORD PTR [rdi+8]
    punpcklbw   xmm5, xmm4

    psubw       xmm1, xmm3
    psubw       xmm2, xmm5
    paddw       xmm6, xmm1
    paddw       xmm6, xmm2
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    paddd       xmm7, xmm1
    paddd       xmm7, xmm2

    movdqa      xmm1, xmm0                      ; current row becomes "previous"
    lea         rsi, [rsi + rax]                ;ref_pixels_per_line

%if ABI_IS_32BIT
    add         rdi, dword ptr arg(3)           ;src_pixels_per_line
%else
    lea         rdi, [rdi + r9]
%endif

    sub         rcx, 1
    jnz         .filter_block2d_bil_sp_only_loop

    jmp         .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_full_pixel:
    ; xoffset == 0 and yoffset == 0: plain difference, no filtering.
    mov         rsi, arg(0)                     ;ref_ptr
    mov         rdi, arg(2)                     ;src_ptr
    movsxd      rcx, dword ptr arg(4)           ;Height

    test        rcx, rcx                        ; nothing to accumulate for Height=0
    jz          .filter_block2d_bil_variance

    movsxd      rax, dword ptr arg(1)           ;ref_pixels_per_line
    movsxd      rdx, dword ptr arg(3)           ;src_pixels_per_line
    pxor        xmm0, xmm0                      ; zero, used to widen bytes to words

.filter_block2d_bil_full_pixel_loop:
    movq        xmm1, QWORD PTR [rsi]
    punpcklbw   xmm1, xmm0
    movq        xmm2, QWORD PTR [rsi+8]
    punpcklbw   xmm2, xmm0

    movq        xmm3, QWORD PTR [rdi]
    punpcklbw   xmm3, xmm0
    movq        xmm4, QWORD PTR [rdi+8]
    punpcklbw   xmm4, xmm0

    psubw       xmm1, xmm3
    psubw       xmm2, xmm4
    paddw       xmm6, xmm1
    paddw       xmm6, xmm2
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm2, xmm2
    paddd       xmm7, xmm1
    paddd       xmm7, xmm2

    lea         rsi, [rsi + rax]                ;ref_pixels_per_line
    lea         rdi, [rdi + rdx]                ;src_pixels_per_line
    sub         rcx, 1
    jnz         .filter_block2d_bil_full_pixel_loop

    jmp         .filter_block2d_bil_variance

.filter_block2d_bil_var_ssse3_fp_only:
    ; yoffset == 0: horizontal filter only. rax already points at HFilter;
    ; rdx is free and is reused for the ref stride.
    mov         rsi, arg(0)                     ;ref_ptr
    mov         rdi, arg(2)                     ;src_ptr
    movsxd      rcx, dword ptr arg(4)           ;Height

    test        rcx, rcx                        ; nothing to accumulate for Height=0
    jz          .filter_block2d_bil_variance

    movsxd      rdx, dword ptr arg(1)           ;ref_pixels_per_line

%if ABI_IS_32BIT=0
    movsxd      r9, dword ptr arg(3)            ;src_pixels_per_line
%endif

.filter_block2d_bil_fp_only_loop:
    movdqu      xmm1, XMMWORD PTR [rsi]
    movdqu      xmm2, XMMWORD PTR [rsi+1]
    movdqa      xmm3, xmm1

    punpcklbw   xmm1, xmm2
    punpckhbw   xmm3, xmm2
    pmaddubsw   xmm1, [rax]
    pmaddubsw   xmm3, [rax]

    paddw       xmm1, [GLOBAL(xmm_bi_rd)]
    paddw       xmm3, [GLOBAL(xmm_bi_rd)]
    psraw       xmm1, xmm_filter_shift
    psraw       xmm3, xmm_filter_shift

    movq        xmm2, QWORD PTR [rdi]           ; 8-byte load, same as the other paths
    pxor        xmm4, xmm4
    punpcklbw   xmm2, xmm4
    movq        xmm5, QWORD PTR [rdi+8]
    punpcklbw   xmm5, xmm4

    psubw       xmm1, xmm2
    psubw       xmm3, xmm5
    paddw       xmm6, xmm1
    paddw       xmm6, xmm3
    pmaddwd     xmm1, xmm1
    pmaddwd     xmm3, xmm3
    paddd       xmm7, xmm1
    paddd       xmm7, xmm3

    lea         rsi, [rsi + rdx]
%if ABI_IS_32BIT
    add         rdi, dword ptr arg(3)           ;src_pixels_per_line
%else
    lea         rdi, [rdi + r9]
%endif

    sub         rcx, 1
    jnz         .filter_block2d_bil_fp_only_loop

    ; falls through into the reduction below

.filter_block2d_bil_variance:
    ; Horizontal reduction of xmm6 (8 x int16) and xmm7 (4 x int32).
    pxor        xmm0, xmm0
    pxor        xmm1, xmm1
    pxor        xmm5, xmm5

    ; Put each int16 in the upper half of a dword, then arithmetic-shift
    ; right by 16: this sign-extends the eight sums to int32.
    punpcklwd   xmm0, xmm6
    punpckhwd   xmm1, xmm6
    psrad       xmm0, 16
    psrad       xmm1, 16
    paddd       xmm0, xmm1                      ; 4 partial sums
    movdqa      xmm1, xmm0

    ; Fold 4 dwords to 2 by spreading them over qword lanes and adding.
    movdqa      xmm6, xmm7
    punpckldq   xmm6, xmm5
    punpckhdq   xmm7, xmm5
    paddd       xmm6, xmm7

    punpckldq   xmm0, xmm5
    punpckhdq   xmm1, xmm5
    paddd       xmm0, xmm1

    ; Fold the upper qword onto the lower: dword 0 now holds the total.
    movdqa      xmm7, xmm6
    movdqa      xmm1, xmm0

    psrldq      xmm7, 8
    psrldq      xmm1, 8

    paddd       xmm6, xmm7
    paddd       xmm0, xmm1

    mov         rsi, arg(7)                     ;[Sum]
    mov         rdi, arg(8)                     ;[SSE]

    movd        [rsi], xmm0
    movd        [rdi], xmm6

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
; Rounding term added before the arithmetic shift by xmm_filter_shift.
align 16
xmm_bi_rd:
    times 8 dw 64
; Bilinear coefficient pairs, one 16-byte entry per offset; every pair
; adds up to 128. Entries are located with (offset << 4). Entry 0 is
; present only to keep that indexing uniform: offset 0 is handled by
; separate code paths and this entry is never loaded.
align 16
vp8_bilinear_filters_ssse3:
    times 8 db 128, 0
    times 8 db 112, 16
    times 8 db 96,  32
    times 8 db 80,  48
    times 8 db 64,  64
    times 8 db 48,  80
    times 8 db 32,  96
    times 8 db 16,  112