;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

%include "vpx_ports/x86_abi_support.asm"

; Notes common to the three routines in this file
; -----------------------------------------------
; * Syntax is Intel operand order (dst, src). Argument access (arg(n)),
;   symbol decoration (sym) and the prolog/epilog macros come from
;   x86_abi_support.asm.
; * Each routine processes a 16-pixel-wide column of Height rows and writes
;   two 32-bit results:
;       *sum        = sum of (interp(ref) - src)
;       *sumsquared = sum of (interp(ref) - src)^2
;   where interp() is a rounding byte average (pavgb) of neighbouring
;   reference pixels.
; * Register roles inside the row loops:
;       rsi = current reference row     rax = reference stride
;       rdi = current source row        rdx = source stride
;       rcx = rows remaining
;       xmm0 = all zero (used to widen bytes to words; never written in loop)
;       xmm6 = 8 x 16-bit running sums of differences
;       xmm7 = 4 x 32-bit running sums of squared differences
; * The difference sums are kept in signed 16-bit lanes (paddw). Every row
;   adds two differences, each in [-255, 255], to every lane, so the lanes
;   are only guaranteed not to wrap for Height <= 64.
; * Height == 0 is handled explicitly: no pixel is read and both outputs
;   are written as 0.

;void vp9_half_horiz_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; interp(ref) = pavgb(pavgb(r[y][x], r[y][x+1]), pavgb(r[y+1][x], r[y+1][x+1]))
; Reads 17 bytes from each of Height + 1 reference rows and 16 bytes from
; each of Height source rows.
global sym(vp9_half_horiz_vert_variance16x_h_sse2) PRIVATE
sym(vp9_half_horiz_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                ;  sum accumulator (8 x int16)
        pxor            xmm7,           xmm7                ;  SSE accumulator (4 x int32)
        mov             rsi,            arg(0) ;ref_ptr

        mov             rdi,            arg(2) ;src_ptr
        movsxd          rcx,            dword ptr arg(4) ;Height
        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3) ;src_pixels_per_line

        pxor            xmm0,           xmm0                ;  zero for byte->word unpacking

        ; The loop below is bottom-tested; without this check a zero Height
        ; would wrap the counter and run far past both buffers.
        test            rcx,            rcx
        jz              .half_horiz_vert_variance16x_h_done

        movdqu          xmm5,           XMMWORD PTR [rsi]
        movdqu          xmm3,           XMMWORD PTR [rsi+1]
        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) horizontal average of row 0

        lea             rsi,            [rsi + rax]

.half_horiz_vert_variance16x_h_1:
        movdqu          xmm1,           XMMWORD PTR [rsi]
        movdqu          xmm2,           XMMWORD PTR [rsi+1]
        pavgb           xmm1,           xmm2                ;  xmm1 = avg(xmm1,xmm2) horizontal average of row i+1

        pavgb           xmm5,           xmm1                ;  xmm5 = vertical average of rows i and i+1

        movdqa          xmm4,           xmm5
        punpcklbw       xmm5,           xmm0                ;  xmm5 = pixels 0..7 as words
        punpckhbw       xmm4,           xmm0                ;  xmm4 = pixels 8..15 as words

        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3

        movq            xmm3,           QWORD PTR [rdi+8]   ;  xmm3 = d8,d9..d15
        punpcklbw       xmm3,           xmm0
        psubw           xmm4,           xmm3

        paddw           xmm6,           xmm5                ;  xmm6 += column differences
        paddw           xmm6,           xmm4
        pmaddwd         xmm5,           xmm5                ;  square words, add adjacent pairs -> dwords
        pmaddwd         xmm4,           xmm4
        paddd           xmm7,           xmm5                ;  xmm7 += squared column differences
        paddd           xmm7,           xmm4

        movdqa          xmm5,           xmm1                ;  row i+1's horizontal average becomes row i next time

        lea             rsi,            [rsi + rax]
        lea             rdi,            [rdi + rdx]

        sub             rcx,            1
        jnz             .half_horiz_vert_variance16x_h_1

.half_horiz_vert_variance16x_h_done:
        pxor        xmm1,           xmm1
        pxor        xmm5,           xmm5                    ;  zero for the dword interleaves below

        ; Widen the 8 signed word sums to dwords: place each word in the
        ; upper half of a dword (xmm0/xmm1 are zero here), then shift right
        ; arithmetically to sign-extend.
        punpcklwd   xmm0,           xmm6
        punpckhwd   xmm1,           xmm6
        psrad       xmm0,           16
        psrad       xmm1,           16
        paddd       xmm0,           xmm1                    ;  4 dword partial sums
        movdqa      xmm1,           xmm0

        ; Horizontal add of the 4 dwords of each accumulator.
        movdqa      xmm6,           xmm7
        punpckldq   xmm6,           xmm5                    ;  s0, 0, s1, 0
        punpckhdq   xmm7,           xmm5                    ;  s2, 0, s3, 0
        paddd       xmm6,           xmm7                    ;  s0+s2, 0, s1+s3, 0

        punpckldq   xmm0,           xmm5
        punpckhdq   xmm1,           xmm5
        paddd       xmm0,           xmm1

        movdqa      xmm7,           xmm6
        movdqa      xmm1,           xmm0

        psrldq      xmm7,           8
        psrldq      xmm1,           8

        paddd       xmm6,           xmm7                    ;  low dword = total SSE
        paddd       xmm0,           xmm1                    ;  low dword = total sum

        mov         rsi,            arg(5) ;[Sum]
        mov         rdi,            arg(6) ;[SSE]

        movd        [rsi],       xmm0
        movd        [rdi],       xmm6

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_half_vert_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; interp(ref) = pavgb(r[y][x], r[y+1][x])
; Reads 16 bytes from each of Height + 1 reference rows and 16 bytes from
; each of Height source rows.
global sym(vp9_half_vert_variance16x_h_sse2) PRIVATE
sym(vp9_half_vert_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                ;  sum accumulator (8 x int16)
        pxor            xmm7,           xmm7                ;  SSE accumulator (4 x int32)
        mov             rsi,            arg(0) ;ref_ptr

        mov             rdi,            arg(2) ;src_ptr
        movsxd          rcx,            dword ptr arg(4) ;Height
        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3) ;src_pixels_per_line

        pxor            xmm0,           xmm0                ;  zero for byte->word unpacking

        ; The loop below is bottom-tested; without this check a zero Height
        ; would wrap the counter and run far past both buffers.
        test            rcx,            rcx
        jz              .half_vert_variance16x_h_done

        movdqu          xmm5,           XMMWORD PTR [rsi]   ;  xmm5 = reference row 0
        lea             rsi,            [rsi + rax]

.half_vert_variance16x_h_1:
        movdqu          xmm3,           XMMWORD PTR [rsi]   ;  xmm3 = reference row i+1

        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) vertical average
        movdqa          xmm4,           xmm5
        punpcklbw       xmm5,           xmm0                ;  xmm5 = pixels 0..7 as words
        punpckhbw       xmm4,           xmm0                ;  xmm4 = pixels 8..15 as words

        movq            xmm2,           QWORD PTR [rdi]     ;  xmm2 = d0,d1,d2..d7
        punpcklbw       xmm2,           xmm0
        psubw           xmm5,           xmm2
        movq            xmm2,           QWORD PTR [rdi+8]   ;  xmm2 = d8,d9..d15
        punpcklbw       xmm2,           xmm0
        psubw           xmm4,           xmm2

        paddw           xmm6,           xmm5                ;  xmm6 += column differences
        paddw           xmm6,           xmm4
        pmaddwd         xmm5,           xmm5                ;  square words, add adjacent pairs -> dwords
        pmaddwd         xmm4,           xmm4
        paddd           xmm7,           xmm5                ;  xmm7 += squared column differences
        paddd           xmm7,           xmm4

        movdqa          xmm5,           xmm3                ;  row i+1 becomes row i next time

        lea             rsi,            [rsi + rax]
        lea             rdi,            [rdi + rdx]

        sub             rcx,            1
        jnz             .half_vert_variance16x_h_1

.half_vert_variance16x_h_done:
        pxor        xmm1,           xmm1
        pxor        xmm5,           xmm5                    ;  zero for the dword interleaves below

        ; Widen the 8 signed word sums to dwords: place each word in the
        ; upper half of a dword (xmm0/xmm1 are zero here), then shift right
        ; arithmetically to sign-extend.
        punpcklwd   xmm0,           xmm6
        punpckhwd   xmm1,           xmm6
        psrad       xmm0,           16
        psrad       xmm1,           16
        paddd       xmm0,           xmm1                    ;  4 dword partial sums
        movdqa      xmm1,           xmm0

        ; Horizontal add of the 4 dwords of each accumulator.
        movdqa      xmm6,           xmm7
        punpckldq   xmm6,           xmm5                    ;  s0, 0, s1, 0
        punpckhdq   xmm7,           xmm5                    ;  s2, 0, s3, 0
        paddd       xmm6,           xmm7                    ;  s0+s2, 0, s1+s3, 0

        punpckldq   xmm0,           xmm5
        punpckhdq   xmm1,           xmm5
        paddd       xmm0,           xmm1

        movdqa      xmm7,           xmm6
        movdqa      xmm1,           xmm0

        psrldq      xmm7,           8
        psrldq      xmm1,           8

        paddd       xmm6,           xmm7                    ;  low dword = total SSE
        paddd       xmm0,           xmm1                    ;  low dword = total sum

        mov         rsi,            arg(5) ;[Sum]
        mov         rdi,            arg(6) ;[SSE]

        movd        [rsi],       xmm0
        movd        [rdi],       xmm6

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp9_half_horiz_variance16x_h_sse2
;(
;    unsigned char *ref_ptr,
;    int ref_pixels_per_line,
;    unsigned char *src_ptr,
;    int src_pixels_per_line,
;    unsigned int Height,
;    int *sum,
;    unsigned int *sumsquared
;)
;
; interp(ref) = pavgb(r[y][x], r[y][x+1])
; Reads 17 bytes from each of Height reference rows and 16 bytes from each
; of Height source rows.
global sym(vp9_half_horiz_variance16x_h_sse2) PRIVATE
sym(vp9_half_horiz_variance16x_h_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push rsi
    push rdi
    ; end prolog

        pxor            xmm6,           xmm6                ;  sum accumulator (8 x int16)
        pxor            xmm7,           xmm7                ;  SSE accumulator (4 x int32)
        mov             rsi,            arg(0) ;ref_ptr

        mov             rdi,            arg(2) ;src_ptr
        movsxd          rcx,            dword ptr arg(4) ;Height
        movsxd          rax,            dword ptr arg(1) ;ref_pixels_per_line
        movsxd          rdx,            dword ptr arg(3) ;src_pixels_per_line

        pxor            xmm0,           xmm0                ;  zero for byte->word unpacking

        ; The loop below is bottom-tested; without this check a zero Height
        ; would wrap the counter and run far past both buffers.
        test            rcx,            rcx
        jz              .half_horiz_variance16x_h_done

.half_horiz_variance16x_h_1:
        movdqu          xmm5,           XMMWORD PTR [rsi]     ;  xmm5 = s0,s1,s2..s15
        movdqu          xmm3,           XMMWORD PTR [rsi+1]   ;  xmm3 = s1,s2,s3..s16

        pavgb           xmm5,           xmm3                ;  xmm5 = avg(xmm5,xmm3) horizontal average
        movdqa          xmm1,           xmm5
        punpcklbw       xmm5,           xmm0                ;  xmm5 = pixels 0..7 as words
        punpckhbw       xmm1,           xmm0                ;  xmm1 = pixels 8..15 as words

        movq            xmm3,           QWORD PTR [rdi]     ;  xmm3 = d0,d1,d2..d7
        punpcklbw       xmm3,           xmm0                ;  xmm3 = words of above
        movq            xmm2,           QWORD PTR [rdi+8]   ;  xmm2 = d8,d9..d15
        punpcklbw       xmm2,           xmm0

        psubw           xmm5,           xmm3                ;  xmm5 -= xmm3
        psubw           xmm1,           xmm2
        paddw           xmm6,           xmm5                ;  xmm6 += column differences
        paddw           xmm6,           xmm1
        pmaddwd         xmm5,           xmm5                ;  square words, add adjacent pairs -> dwords
        pmaddwd         xmm1,           xmm1
        paddd           xmm7,           xmm5                ;  xmm7 += squared column differences
        paddd           xmm7,           xmm1

        lea             rsi,            [rsi + rax]
        lea             rdi,            [rdi + rdx]

        sub             rcx,            1
        jnz             .half_horiz_variance16x_h_1

.half_horiz_variance16x_h_done:
        pxor        xmm1,           xmm1
        pxor        xmm5,           xmm5                    ;  zero for the dword interleaves below

        ; Widen the 8 signed word sums to dwords: place each word in the
        ; upper half of a dword (xmm0/xmm1 are zero here), then shift right
        ; arithmetically to sign-extend.
        punpcklwd   xmm0,           xmm6
        punpckhwd   xmm1,           xmm6
        psrad       xmm0,           16
        psrad       xmm1,           16
        paddd       xmm0,           xmm1                    ;  4 dword partial sums
        movdqa      xmm1,           xmm0

        ; Horizontal add of the 4 dwords of each accumulator.
        movdqa      xmm6,           xmm7
        punpckldq   xmm6,           xmm5                    ;  s0, 0, s1, 0
        punpckhdq   xmm7,           xmm5                    ;  s2, 0, s3, 0
        paddd       xmm6,           xmm7                    ;  s0+s2, 0, s1+s3, 0

        punpckldq   xmm0,           xmm5
        punpckhdq   xmm1,           xmm5
        paddd       xmm0,           xmm1

        movdqa      xmm7,           xmm6
        movdqa      xmm1,           xmm0

        psrldq      xmm7,           8
        psrldq      xmm1,           8

        paddd       xmm6,           xmm7                    ;  low dword = total SSE
        paddd       xmm0,           xmm1                    ;  low dword = total sum

        mov         rsi,            arg(5) ;[Sum]
        mov         rdi,            arg(6) ;[SSE]

        movd        [rsi],       xmm0
        movd        [rdi],       xmm6

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret