;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

;-----------------------------------------------------------------------
; MMX sum-of-absolute-differences (SAD) kernels for 16x16, 8x16, 8x8,
; 4x4 and 16x8 pixel blocks.
;
; All five entry points share the C prototype
;
;   unsigned int vp9_sadWxH_mmx(unsigned char *src_ptr, int src_stride,
;                               unsigned char *ref_ptr, int ref_stride)
;
; Register roles after the shared prolog:
;   rsi = src_ptr         rax = src_stride (sign-extended)
;   rdi = ref_ptr         rdx = ref_stride (sign-extended)
;
; Technique: |a - b| on unsigned bytes is built from two saturating
; subtractions (a -sat b) | (b -sat a); the bytes are then widened to
; words against a zero register and accumulated with paddw.
;
; No emms is issued anywhere in this file, so the MMX state is left
; live on return.
; NOTE(review): presumably callers reset the FPU/MMX state themselves
; - confirm against the call sites.
;-----------------------------------------------------------------------

%include "vpx_ports/x86_abi_support.asm"

global sym(vp9_sad16x16_mmx) PRIVATE
global sym(vp9_sad8x16_mmx) PRIVATE
global sym(vp9_sad8x8_mmx) PRIVATE
global sym(vp9_sad4x4_mmx) PRIVATE
global sym(vp9_sad16x8_mmx) PRIVATE

;-----------------------------------------------------------------------
; SAD_MMX_PROLOG
; Common function entry: builds the rbp frame, shadows the four
; arguments, saves rsi/rdi and loads the arguments into registers.
;-----------------------------------------------------------------------
%macro SAD_MMX_PROLOG 0
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%endmacro

;-----------------------------------------------------------------------
; SAD_MMX_HSUM_TO_RAX
; In:  mm7 = four 16-bit partial sums, mm6 = 0
; Out: rax, whose low 16 bits hold the sum of the four words.
;      The word adds wrap modulo 2^16; the largest block here (16x16)
;      tops out at 256 * 255 = 65280, which still fits.
;      The upper half of mm7 keeps a partial sum, so only the 32-bit
;      return value is meaningful.
; Clobbers: mm0, mm7
;-----------------------------------------------------------------------
%macro SAD_MMX_HSUM_TO_RAX 0
    movq        mm0,        mm7
    punpcklwd   mm0,        mm6                 ; words 0,1 -> dwords
    punpckhwd   mm7,        mm6                 ; words 2,3 -> dwords
    paddw       mm0,        mm7                 ; (w0+w2), (w1+w3)

    movq        mm7,        mm0
    psrlq       mm0,        32                  ; bring (w1+w3) down
    paddw       mm7,        mm0                 ; low word = w0+w1+w2+w3

    movq        rax,        mm7
%endmacro

;-----------------------------------------------------------------------
; SAD_MMX_EPILOG
; Common function exit: restores rdi/rsi and the frame, then returns.
;-----------------------------------------------------------------------
%macro SAD_MMX_EPILOG 0
    pop         rdi
    pop         rsi
    mov         rsp,        rbp
    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret
%endmacro


;-----------------------------------------------------------------------
; unsigned int vp9_sad16x16_mmx(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride)
;
; SAD over 16 rows of 16 bytes.
;   rcx = src_ptr + 16*src_stride (row loop end marker)
;   mm6 = 0, mm7 = running word sums
;-----------------------------------------------------------------------
sym(vp9_sad16x16_mmx):
    SAD_MMX_PROLOG

    lea         rcx,        [rsi+rax*8]
    lea         rcx,        [rcx+rax*8]         ; +8 rows twice = 16 rows

    pxor        mm7,        mm7
    pxor        mm6,        mm6

.sad16x16_row:
    movq        mm0,        QWORD PTR [rsi]     ; src bytes 0..7
    movq        mm2,        QWORD PTR [rsi+8]   ; src bytes 8..15
    movq        mm1,        QWORD PTR [rdi]     ; ref bytes 0..7
    movq        mm3,        QWORD PTR [rdi+8]   ; ref bytes 8..15

    movq        mm4,        mm0                 ; keep src copies for the
    movq        mm5,        mm2                 ; reverse subtraction

    psubusb     mm0,        mm1
    psubusb     mm1,        mm4
    psubusb     mm2,        mm3
    psubusb     mm3,        mm5

    por         mm0,        mm1                 ; |src - ref|, bytes 0..7
    por         mm2,        mm3                 ; |src - ref|, bytes 8..15

    movq        mm1,        mm0
    movq        mm3,        mm2

    punpcklbw   mm0,        mm6                 ; widen bytes to words
    punpcklbw   mm2,        mm6
    punpckhbw   mm1,        mm6
    punpckhbw   mm3,        mm6

    paddw       mm0,        mm2
    paddw       mm1,        mm3

    lea         rsi,        [rsi+rax]           ; next src row
    add         rdi,        rdx                 ; next ref row

    paddw       mm7,        mm0
    paddw       mm7,        mm1

    cmp         rsi,        rcx
    jne         .sad16x16_row

    SAD_MMX_HSUM_TO_RAX
    SAD_MMX_EPILOG


;-----------------------------------------------------------------------
; unsigned int vp9_sad8x16_mmx(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride)
;
; SAD over 16 rows of 8 bytes.
;   rcx = src_ptr + 16*src_stride (row loop end marker)
;   mm6 = 0, mm7 = running word sums
;-----------------------------------------------------------------------
sym(vp9_sad8x16_mmx):
    SAD_MMX_PROLOG

    lea         rcx,        [rsi+rax*8]
    lea         rcx,        [rcx+rax*8]         ; +8 rows twice = 16 rows

    pxor        mm7,        mm7
    pxor        mm6,        mm6

.sad8x16_row:
    movq        mm0,        QWORD PTR [rsi]     ; 8 src bytes
    movq        mm1,        QWORD PTR [rdi]     ; 8 ref bytes

    movq        mm2,        mm0
    psubusb     mm0,        mm1
    psubusb     mm1,        mm2
    por         mm0,        mm1                 ; |src - ref|

    movq        mm2,        mm0
    punpcklbw   mm0,        mm6                 ; low 4 bytes -> words
    punpckhbw   mm2,        mm6                 ; high 4 bytes -> words

    lea         rsi,        [rsi+rax]           ; next src row
    add         rdi,        rdx                 ; next ref row

    paddw       mm7,        mm0
    paddw       mm7,        mm2

    cmp         rsi,        rcx
    jne         .sad8x16_row

    SAD_MMX_HSUM_TO_RAX
    SAD_MMX_EPILOG


;-----------------------------------------------------------------------
; unsigned int vp9_sad8x8_mmx(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride)
;
; SAD over 8 rows of 8 bytes.
;   rcx = src_ptr + 8*src_stride (row loop end marker)
;   mm6 = 0, mm7 = running word sums
;-----------------------------------------------------------------------
sym(vp9_sad8x8_mmx):
    SAD_MMX_PROLOG

    lea         rcx,        [rsi+rax*8]

    pxor        mm7,        mm7
    pxor        mm6,        mm6

.sad8x8_row:
    movq        mm0,        QWORD PTR [rsi]     ; 8 src bytes
    movq        mm1,        QWORD PTR [rdi]     ; 8 ref bytes

    movq        mm2,        mm0
    psubusb     mm0,        mm1
    psubusb     mm1,        mm2
    por         mm0,        mm1                 ; |src - ref|

    movq        mm2,        mm0
    punpcklbw   mm0,        mm6                 ; low 4 bytes -> words
    punpckhbw   mm2,        mm6                 ; high 4 bytes -> words
    paddw       mm0,        mm2

    lea         rsi,        [rsi+rax]           ; next src row
    add         rdi,        rdx                 ; next ref row

    paddw       mm7,        mm0

    cmp         rsi,        rcx
    jne         .sad8x8_row

    SAD_MMX_HSUM_TO_RAX
    SAD_MMX_EPILOG


;-----------------------------------------------------------------------
; unsigned int vp9_sad4x4_mmx(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride)
;
; SAD over 4 rows of 4 bytes, fully unrolled. Two 4-byte rows are
; byte-interleaved into one 8-byte register (the same interleave is
; applied to src and ref, so lanes still pair up) and handled together.
;   mm3 = 0, mm0 = running word sums
;-----------------------------------------------------------------------
sym(vp9_sad4x4_mmx):
    SAD_MMX_PROLOG

    ; ---- rows 0 and 1 ----
    movd        mm0,        DWORD PTR [rsi]
    movd        mm1,        DWORD PTR [rdi]

    movd        mm2,        DWORD PTR [rsi+rax]
    movd        mm3,        DWORD PTR [rdi+rdx]

    punpcklbw   mm0,        mm2                 ; src rows 0/1 interleaved
    punpcklbw   mm1,        mm3                 ; ref rows 0/1 interleaved

    movq        mm2,        mm0
    psubusb     mm0,        mm1
    psubusb     mm1,        mm2
    por         mm0,        mm1                 ; |src - ref|

    movq        mm2,        mm0
    pxor        mm3,        mm3                 ; mm3 = 0 from here on

    punpcklbw   mm0,        mm3                 ; widen bytes to words
    punpckhbw   mm2,        mm3
    paddw       mm0,        mm2

    ; ---- rows 2 and 3 ----
    lea         rsi,        [rsi+rax*2]
    lea         rdi,        [rdi+rdx*2]

    movd        mm4,        DWORD PTR [rsi]
    movd        mm5,        DWORD PTR [rdi]

    movd        mm6,        DWORD PTR [rsi+rax]
    movd        mm7,        DWORD PTR [rdi+rdx]

    punpcklbw   mm4,        mm6                 ; src rows 2/3 interleaved
    punpcklbw   mm5,        mm7                 ; ref rows 2/3 interleaved

    movq        mm6,        mm4
    psubusb     mm4,        mm5
    psubusb     mm5,        mm6
    por         mm4,        mm5                 ; |src - ref|

    movq        mm5,        mm4
    punpcklbw   mm4,        mm3                 ; widen bytes to words
    punpckhbw   mm5,        mm3
    paddw       mm4,        mm5

    ; ---- combine both halves and fold the four words ----
    paddw       mm0,        mm4
    movq        mm1,        mm0

    punpcklwd   mm0,        mm3                 ; words 0,1 -> dwords
    punpckhwd   mm1,        mm3                 ; words 2,3 -> dwords

    paddw       mm0,        mm1
    movq        mm1,        mm0

    psrlq       mm0,        32
    paddw       mm0,        mm1                 ; low word = total

    movq        rax,        mm0

    SAD_MMX_EPILOG


;-----------------------------------------------------------------------
; unsigned int vp9_sad16x8_mmx(
;     unsigned char *src_ptr,
;     int            src_stride,
;     unsigned char *ref_ptr,
;     int            ref_stride)
;
; SAD over 8 rows of 16 bytes.
;   rcx = src_ptr + 8*src_stride (row loop end marker)
;   mm6 = 0, mm7 = running word sums
;-----------------------------------------------------------------------
sym(vp9_sad16x8_mmx):
    SAD_MMX_PROLOG

    lea         rcx,        [rsi+rax*8]

    pxor        mm7,        mm7
    pxor        mm6,        mm6

.sad16x8_row:
    movq        mm0,        [rsi]               ; src bytes 0..7
    movq        mm1,        [rdi]               ; ref bytes 0..7

    movq        mm2,        [rsi+8]             ; src bytes 8..15
    movq        mm3,        [rdi+8]             ; ref bytes 8..15

    movq        mm4,        mm0                 ; keep src copies for the
    movq        mm5,        mm2                 ; reverse subtraction

    psubusb     mm0,        mm1
    psubusb     mm1,        mm4

    psubusb     mm2,        mm3
    psubusb     mm3,        mm5

    por         mm0,        mm1                 ; |src - ref|, bytes 0..7
    por         mm2,        mm3                 ; |src - ref|, bytes 8..15

    movq        mm1,        mm0
    movq        mm3,        mm2

    punpcklbw   mm0,        mm6                 ; widen bytes to words
    punpckhbw   mm1,        mm6

    punpcklbw   mm2,        mm6
    punpckhbw   mm3,        mm6

    paddw       mm0,        mm2
    paddw       mm1,        mm3

    paddw       mm0,        mm1
    lea         rsi,        [rsi+rax]           ; next src row

    add         rdi,        rdx                 ; next ref row
    paddw       mm7,        mm0

    cmp         rsi,        rcx
    jne         .sad16x8_row

    SAD_MMX_HSUM_TO_RAX
    SAD_MMX_EPILOG