;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; Conventions used throughout this file:
;   * Arguments are read through the arg(n) macro after SHADOW_ARGS_TO_STACK;
;     both come from x86_abi_support.asm.
;   * SAD results are returned in rax.
;   * The routines that use mm0-mm7 do not execute emms.
;     NOTE(review): presumably the caller clears the MMX state - confirm.

;unsigned int vp8_sad16x16_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute differences over a block 16 bytes wide and 16 rows high.
; Two rows are handled per loop iteration.
;
; Register roles:
;   rsi = current source row        rax = src_stride (sign-extended)
;   rdi = current reference row     rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 16*src_stride   (loop end sentinel)
;   xmm6 = running sums, one per 64-bit half
global sym(vp8_sad16x16_wmt) PRIVATE
sym(vp8_sad16x16_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 6
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0) ;src_ptr
    mov         rdi,        arg(2) ;ref_ptr

    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    ; rcx = src_ptr + 8*stride + 8*stride, i.e. one past the 16th row
    lea         rcx,        [rsi+rax*8]

    lea         rcx,        [rcx+rax*8]
    pxor        xmm6,       xmm6

.x16x16sad_wmt_loop:

    ; Each 16-byte row is fetched as two 8-byte halves.
    movq        xmm0,       QWORD PTR [rsi]
    movq        xmm2,       QWORD PTR [rsi+8]

    movq        xmm1,       QWORD PTR [rdi]
    movq        xmm3,       QWORD PTR [rdi+8]

    movq        xmm4,       QWORD PTR [rsi+rax]
    movq        xmm5,       QWORD PTR [rdi+rdx]

    ; Interleave the two halves into one 16-byte register.  Source and
    ; reference get the same byte permutation, so the SAD is unchanged.
    punpcklbw   xmm0,       xmm2
    punpcklbw   xmm1,       xmm3

    psadbw      xmm0,       xmm1
    movq        xmm2,       QWORD PTR [rsi+rax+8]

    movq        xmm3,       QWORD PTR [rdi+rdx+8]
    lea         rsi,        [rsi+rax*2]

    lea         rdi,        [rdi+rdx*2]
    punpcklbw   xmm4,       xmm2

    punpcklbw   xmm5,       xmm3
    psadbw      xmm4,       xmm5

    paddw       xmm6,       xmm0
    paddw       xmm6,       xmm4

    cmp         rsi,        rcx
    jne         .x16x16sad_wmt_loop

    ; Fold the high-half sum into the low half and return it.
    movq        xmm0,       xmm6
    psrldq      xmm6,       8

    paddw       xmm0,       xmm6
    movq        rax,        xmm0

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;unsigned int vp8_sad8x16_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_sad)
;
; Sum of absolute differences over a block 8 bytes wide and 16 rows high,
; two rows per iteration.  Before each iteration the low 32 bits of the
; running sum are compared (unsigned) with max_sad; once the sum is greater
; the routine returns that partial sum immediately.
;
; Register roles:
;   rsi = current source row        rbx = src_stride (sign-extended)
;   rdi = current reference row     rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 16*src_stride   (loop end sentinel)
;   mm7 = running sum               rax = copy of mm7 / return value
global sym(vp8_sad8x16_wmt) PRIVATE
sym(vp8_sad8x16_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0) ;src_ptr
    mov         rdi,        arg(2) ;ref_ptr

    movsxd      rbx,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    lea         rcx,        [rsi+rbx*8]

    lea         rcx,        [rcx+rbx*8]
    pxor        mm7,        mm7

.x8x16sad_wmt_loop:

    ; Early out: rax already holds the value to return.
    movq        rax,        mm7
    cmp         eax,        arg(4)
    ja          .x8x16sad_wmt_early_exit

    movq        mm0,        QWORD PTR [rsi]
    movq        mm1,        QWORD PTR [rdi]

    movq        mm2,        QWORD PTR [rsi+rbx]
    movq        mm3,        QWORD PTR [rdi+rdx]

    psadbw      mm0,        mm1
    psadbw      mm2,        mm3

    lea         rsi,        [rsi+rbx*2]
    lea         rdi,        [rdi+rdx*2]

    paddw       mm7,        mm0
    paddw       mm7,        mm2

    cmp         rsi,        rcx
    jne         .x8x16sad_wmt_loop

    movq        rax,        mm7

.x8x16sad_wmt_early_exit:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_sad8x8_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_sad)
;
; Sum of absolute differences over a block 8 bytes wide and 8 rows high,
; one row per iteration.  The fifth argument is read as arg(4): before each
; row the low 32 bits of the running sum are compared (unsigned) with it,
; and once the sum is greater the partial sum is returned immediately.
;
; Register roles:
;   rsi = current source row        rbx = src_stride (sign-extended)
;   rdi = current reference row     rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 8*src_stride    (loop end sentinel)
;   mm7 = running sum               rax = copy of mm7 / return value
global sym(vp8_sad8x8_wmt) PRIVATE
sym(vp8_sad8x8_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0) ;src_ptr
    mov         rdi,        arg(2) ;ref_ptr

    movsxd      rbx,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    lea         rcx,        [rsi+rbx*8]
    pxor        mm7,        mm7

.x8x8sad_wmt_loop:

    ; Early out: rax already holds the value to return.
    movq        rax,        mm7
    cmp         eax,        arg(4)
    ja          .x8x8sad_wmt_early_exit

    movq        mm0,        QWORD PTR [rsi]
    movq        mm1,        QWORD PTR [rdi]

    psadbw      mm0,        mm1
    lea         rsi,        [rsi+rbx]

    add         rdi,        rdx
    paddw       mm7,        mm0

    cmp         rsi,        rcx
    jne         .x8x8sad_wmt_loop

    movq        rax,        mm7
.x8x8sad_wmt_early_exit:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret

;unsigned int vp8_sad4x4_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride)
;
; Sum of absolute differences over a block 4 bytes wide and 4 rows high.
; Straight-line code: rows 0/1 and rows 2/3 are each packed into one 8-byte
; MMX register so that two psadbw instructions cover the whole block.
;
; Register roles:
;   rsi = source row pointer        rax = src_stride, then return value
;   rdi = reference row pointer     rdx = ref_stride
global sym(vp8_sad4x4_wmt) PRIVATE
sym(vp8_sad4x4_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0) ;src_ptr
    mov         rdi,        arg(2) ;ref_ptr

    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    ; rows 0 and 1
    movd        mm0,        DWORD PTR [rsi]
    movd        mm1,        DWORD PTR [rdi]

    movd        mm2,        DWORD PTR [rsi+rax]
    movd        mm3,        DWORD PTR [rdi+rdx]

    ; Same interleave on source and reference, so the SAD is unaffected.
    punpcklbw   mm0,        mm2
    punpcklbw   mm1,        mm3

    psadbw      mm0,        mm1
    lea         rsi,        [rsi+rax*2]

    ; rows 2 and 3
    lea         rdi,        [rdi+rdx*2]
    movd        mm4,        DWORD PTR [rsi]

    movd        mm5,        DWORD PTR [rdi]
    movd        mm6,        DWORD PTR [rsi+rax]

    movd        mm7,        DWORD PTR [rdi+rdx]
    punpcklbw   mm4,        mm6

    punpcklbw   mm5,        mm7
    psadbw      mm4,        mm5

    paddw       mm0,        mm4
    movq        rax,        mm0

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;unsigned int vp8_sad16x8_wmt(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  max_sad)
;
; Sum of absolute differences over a block 16 bytes wide and 8 rows high,
; two rows per iteration, each row handled as two 8-byte halves.  The fifth
; argument is read as arg(4): before each iteration the low 32 bits of the
; running sum are compared (unsigned) with it, and once the sum is greater
; the partial sum is returned immediately.
;
; Register roles:
;   rsi = current source row        rbx = src_stride (sign-extended)
;   rdi = current reference row     rdx = ref_stride (sign-extended)
;   rcx = src_ptr + 8*src_stride    (loop end sentinel)
;   mm7 = running sum               rax = copy of mm7 / return value
global sym(vp8_sad16x8_wmt) PRIVATE
sym(vp8_sad16x8_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rbx
    push        rsi
    push        rdi
    ; end prolog


    mov         rsi,        arg(0) ;src_ptr
    mov         rdi,        arg(2) ;ref_ptr

    movsxd      rbx,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;ref_stride

    lea         rcx,        [rsi+rbx*8]
    pxor        mm7,        mm7

.x16x8sad_wmt_loop:

    ; Early out: rax already holds the value to return.
    movq        rax,        mm7
    cmp         eax,        arg(4)
    ja          .x16x8sad_wmt_early_exit

    movq        mm0,        QWORD PTR [rsi]
    movq        mm2,        QWORD PTR [rsi+8]

    movq        mm1,        QWORD PTR [rdi]
    movq        mm3,        QWORD PTR [rdi+8]

    movq        mm4,        QWORD PTR [rsi+rbx]
    movq        mm5,        QWORD PTR [rdi+rdx]

    psadbw      mm0,        mm1
    psadbw      mm2,        mm3

    movq        mm1,        QWORD PTR [rsi+rbx+8]
    movq        mm3,        QWORD PTR [rdi+rdx+8]

    psadbw      mm4,        mm5
    psadbw      mm1,        mm3

    lea         rsi,        [rsi+rbx*2]
    lea         rdi,        [rdi+rdx*2]

    paddw       mm0,        mm2
    paddw       mm4,        mm1

    paddw       mm7,        mm0
    paddw       mm7,        mm4

    cmp         rsi,        rcx
    jne         .x16x8sad_wmt_loop

    movq        rax,        mm7

.x16x8sad_wmt_early_exit:

    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_copy32xn_sse2(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *dst_ptr,
;    int  dst_stride,
;    int  height);
;
; Copies `height` rows of 32 bytes from src_ptr to dst_ptr.  Rows are copied
; four at a time while at least four remain, then one at a time.  A height
; of zero or less copies nothing.
;
; Source rows are read with movdqu (any alignment).  Destination rows are
; written with movdqa, so every destination row address must be 16-byte
; aligned (dst_ptr and dst_stride both multiples of 16).
;
; Register roles:
;   rsi = current source row        rax = src_stride (sign-extended)
;   rdi = current destination row   rdx = dst_stride (sign-extended)
;   rcx = rows still to copy
global sym(vp8_copy32xn_sse2) PRIVATE
sym(vp8_copy32xn_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0) ;src_ptr
    mov         rdi,        arg(2) ;dst_ptr

    movsxd      rax,        dword ptr arg(1) ;src_stride
    movsxd      rdx,        dword ptr arg(3) ;dst_stride
    movsxd      rcx,        dword ptr arg(4) ;height

    ; Fewer than four rows requested: the unrolled loop must not run at all,
    ; otherwise it would copy four rows and leave a negative row count.
    cmp         rcx,        4
    jl          .block_copy_sse2_tail

.block_copy_sse2_loopx4:
    movdqu      xmm0,       XMMWORD PTR [rsi]
    movdqu      xmm1,       XMMWORD PTR [rsi + 16]
    movdqu      xmm2,       XMMWORD PTR [rsi + rax]
    movdqu      xmm3,       XMMWORD PTR [rsi + rax + 16]

    lea         rsi,        [rsi+rax*2]

    movdqu      xmm4,       XMMWORD PTR [rsi]
    movdqu      xmm5,       XMMWORD PTR [rsi + 16]
    movdqu      xmm6,       XMMWORD PTR [rsi + rax]
    movdqu      xmm7,       XMMWORD PTR [rsi + rax + 16]

    lea         rsi,        [rsi+rax*2]

    movdqa      XMMWORD PTR [rdi], xmm0
    movdqa      XMMWORD PTR [rdi + 16], xmm1
    movdqa      XMMWORD PTR [rdi + rdx], xmm2
    movdqa      XMMWORD PTR [rdi + rdx + 16], xmm3

    lea         rdi,        [rdi+rdx*2]

    movdqa      XMMWORD PTR [rdi], xmm4
    movdqa      XMMWORD PTR [rdi + 16], xmm5
    movdqa      XMMWORD PTR [rdi + rdx], xmm6
    movdqa      XMMWORD PTR [rdi + rdx + 16], xmm7

    lea         rdi,        [rdi+rdx*2]

    sub         rcx,        4
    cmp         rcx,        4
    jge         .block_copy_sse2_loopx4

.block_copy_sse2_tail:
    ; rcx is 0..3 after the unrolled loop, or the original height (< 4) when
    ; that loop was skipped.  Nothing left, or a non-positive height: done.
    test        rcx,        rcx
    jle         .copy_is_done

.block_copy_sse2_loop:
    movdqu      xmm0,       XMMWORD PTR [rsi]
    movdqu      xmm1,       XMMWORD PTR [rsi + 16]
    lea         rsi,        [rsi+rax]

    movdqa      XMMWORD PTR [rdi], xmm0
    movdqa      XMMWORD PTR [rdi + 16], xmm1
    lea         rdi,        [rdi+rdx]

    sub         rcx,        1
    jne         .block_copy_sse2_loop

.copy_is_done:
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret