;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; File: media/libvpx/vp9/encoder/x86/vp9_sad_sse3.asm
;
; Syntax: Intel operand order (NASM/Yasm preprocessor).
; Each exported routine computes three sums of absolute differences (SAD)
; between one source block and the reference block at byte offsets
; +0, +1 and +2, and stores them as three 32-bit values at results[0..2].

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; STACK_FRAME_CREATE_X3
;   Binds the symbolic names src_ptr / src_stride / ref_ptr / ref_stride /
;   result_ptr (plus end_ptr, ret_var, max_err, height, which are not used
;   by the routines in this file) to registers or stack slots for the
;   active ABI, and emits whatever prologue that ABI needs.
;
;   32-bit : builds an rbp frame, saves rsi/rdi/rbx, loads the arguments
;            from the stack and sign-extends both strides.
;   Win64  : saves xmm6-xmm7 via SAVE_XMM; arguments stay in rcx/rdx/r8/r9
;            and the fifth argument is read from the caller's stack.
;   SysV64 : arguments stay in rdi/rsi/rdx/rcx/r8.
;
;   On both 64-bit ABIs the strides are declared 'int' but are used as
;   64-bit address components, so they are sign-extended here: the upper
;   32 bits of a 32-bit integer argument register are not guaranteed by
;   the calling convention.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE_X3 0
%if ABI_IS_32BIT
  %define     src_ptr       rsi
  %define     src_stride    rax
  %define     ref_ptr       rdi
  %define     ref_stride    rdx
  %define     end_ptr       rcx
  %define     ret_var       rbx
  %define     result_ptr    arg(4)
  %define     max_err       arg(4)
  %define     height        dword ptr arg(4)
    push        rbp
    mov         rbp,        rsp
    push        rsi
    push        rdi
    push        rbx

    mov         rsi,        arg(0)              ; src_ptr
    mov         rdi,        arg(2)              ; ref_ptr

    movsxd      rax,        dword ptr arg(1)    ; src_stride
    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
%else
  %if LIBVPX_YASM_WIN64
    SAVE_XMM 7, u
    %define     src_ptr     rcx
    %define     src_stride  rdx
    %define     ref_ptr     r8
    %define     ref_stride  r9
    %define     end_ptr     r10
    %define     ret_var     r11
    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
    %define     max_err     [rsp+xmm_stack_space+8+4*8]
    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]

    movsxd      rdx,        edx                 ; src_stride: int -> 64-bit
    movsxd      r9,         r9d                 ; ref_stride: int -> 64-bit
  %else
    %define     src_ptr     rdi
    %define     src_stride  rsi
    %define     ref_ptr     rdx
    %define     ref_stride  rcx
    %define     end_ptr     r9
    %define     ret_var     r10
    %define     result_ptr  r8
    %define     max_err     r8
    %define     height      r8

    movsxd      rsi,        esi                 ; src_stride: int -> 64-bit
    movsxd      rcx,        ecx                 ; ref_stride: int -> 64-bit
  %endif
%endif

%endmacro

;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY_X3
;   Counterpart of STACK_FRAME_CREATE_X3: blanks the symbolic names,
;   undoes the ABI-specific prologue and returns to the caller.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY_X3 0
  %define     src_ptr
  %define     src_stride
  %define     ref_ptr
  %define     ref_stride
  %define     end_ptr
  %define     ret_var
  %define     result_ptr
  %define     max_err
  %define     height

%if ABI_IS_32BIT
    pop         rbx
    pop         rdi
    pop         rsi
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro

;-----------------------------------------------------------------------
; PROCESS_16X2X3 mode, src, ref, src_stride, ref_stride
;   Handles two 16-byte rows. For each row the source row is compared
;   (psadbw) against the reference row at byte offsets +0, +1 and +2.
;   Running totals live in xmm5 (+0), xmm6 (+1) and xmm7 (+2), each as
;   two 64-bit partial sums (low and high eight bytes).
;
;   mode 0 : first pair of rows - initialises xmm5..xmm7, then advances
;            src and ref by two rows.
;   mode 1 : accumulates, then advances src and ref by two rows.
;   other  : accumulates without advancing (used for the last pair).
;
;   The source row is loaded with movdqa, so src must be 16-byte aligned;
;   reference rows are loaded with lddqu and may be unaligned.
;   Clobbers xmm0-xmm3.
;-----------------------------------------------------------------------
%macro PROCESS_16X2X3 5
%if %1==0
        movdqa      xmm0,       XMMWORD PTR [%2]
        lddqu       xmm5,       XMMWORD PTR [%3]
        lddqu       xmm6,       XMMWORD PTR [%3+1]
        lddqu       xmm7,       XMMWORD PTR [%3+2]

        psadbw      xmm5,       xmm0
        psadbw      xmm6,       xmm0
        psadbw      xmm7,       xmm0
%else
        movdqa      xmm0,       XMMWORD PTR [%2]
        lddqu       xmm1,       XMMWORD PTR [%3]
        lddqu       xmm2,       XMMWORD PTR [%3+1]
        lddqu       xmm3,       XMMWORD PTR [%3+2]

        psadbw      xmm1,       xmm0
        psadbw      xmm2,       xmm0
        psadbw      xmm3,       xmm0

        paddw       xmm5,       xmm1
        paddw       xmm6,       xmm2
        paddw       xmm7,       xmm3
%endif
        ; second row of the pair
        movdqa      xmm0,       XMMWORD PTR [%2+%4]
        lddqu       xmm1,       XMMWORD PTR [%3+%5]
        lddqu       xmm2,       XMMWORD PTR [%3+%5+1]
        lddqu       xmm3,       XMMWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        ; advance both pointers by two rows (skipped for the final pair)
        lea         %2,         [%2+%4*2]
        lea         %3,         [%3+%5*2]
%endif

        psadbw      xmm1,       xmm0
        psadbw      xmm2,       xmm0
        psadbw      xmm3,       xmm0

        paddw       xmm5,       xmm1
        paddw       xmm6,       xmm2
        paddw       xmm7,       xmm3
%endmacro

;-----------------------------------------------------------------------
; PROCESS_8X2X3 mode, src, ref, src_stride, ref_stride
;   8-byte-wide counterpart of PROCESS_16X2X3 using MMX registers.
;   Running totals live in mm5 (+0), mm6 (+1) and mm7 (+2).
;   The mode argument has the same meaning as in PROCESS_16X2X3.
;   Clobbers mm0-mm3.
;-----------------------------------------------------------------------
%macro PROCESS_8X2X3 5
%if %1==0
        movq        mm0,       QWORD PTR [%2]
        movq        mm5,       QWORD PTR [%3]
        movq        mm6,       QWORD PTR [%3+1]
        movq        mm7,       QWORD PTR [%3+2]

        psadbw      mm5,       mm0
        psadbw      mm6,       mm0
        psadbw      mm7,       mm0
%else
        movq        mm0,       QWORD PTR [%2]
        movq        mm1,       QWORD PTR [%3]
        movq        mm2,       QWORD PTR [%3+1]
        movq        mm3,       QWORD PTR [%3+2]

        psadbw      mm1,       mm0
        psadbw      mm2,       mm0
        psadbw      mm3,       mm0

        paddw       mm5,       mm1
        paddw       mm6,       mm2
        paddw       mm7,       mm3
%endif
        ; second row of the pair
        movq        mm0,       QWORD PTR [%2+%4]
        movq        mm1,       QWORD PTR [%3+%5]
        movq        mm2,       QWORD PTR [%3+%5+1]
        movq        mm3,       QWORD PTR [%3+%5+2]

%if %1==0 || %1==1
        ; advance both pointers by two rows (skipped for the final pair)
        lea         %2,        [%2+%4*2]
        lea         %3,        [%3+%5*2]
%endif

        psadbw      mm1,       mm0
        psadbw      mm2,       mm0
        psadbw      mm3,       mm0

        paddw       mm5,       mm1
        paddw       mm6,       mm2
        paddw       mm7,       mm3
%endmacro

;void vp9_sad16x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; 16 rows of 16 bytes: eight row pairs. results[k] receives the SAD
; against ref_ptr+k for k = 0, 1, 2.
global sym(vp9_sad16x16x3_sse3) PRIVATE
sym(vp9_sad16x16x3_sse3):

    STACK_FRAME_CREATE_X3

        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; rcx is free to reuse from here on: the block pointers are dead.
        mov         rcx,        result_ptr

        ; Fold the low and high 64-bit partial sums of each accumulator
        ; and store the low 32 bits.
        movq        xmm0,       xmm5
        psrldq      xmm5,       8

        paddw       xmm0,       xmm5
        movd        [rcx],      xmm0
;-
        movq        xmm0,       xmm6
        psrldq      xmm6,       8

        paddw       xmm0,       xmm6
        movd        [rcx+4],    xmm0
;-
        movq        xmm0,       xmm7
        psrldq      xmm7,       8

        paddw       xmm0,       xmm7
        movd        [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3

;void vp9_sad16x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; 8 rows of 16 bytes: four row pairs. results[k] receives the SAD
; against ref_ptr+k for k = 0, 1, 2.
global sym(vp9_sad16x8x3_sse3) PRIVATE
sym(vp9_sad16x8x3_sse3):

    STACK_FRAME_CREATE_X3

        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        ; rcx is free to reuse from here on: the block pointers are dead.
        mov         rcx,        result_ptr

        ; Fold the low and high 64-bit partial sums of each accumulator
        ; and store the low 32 bits.
        movq        xmm0,       xmm5
        psrldq      xmm5,       8

        paddw       xmm0,       xmm5
        movd        [rcx],      xmm0
;-
        movq        xmm0,       xmm6
        psrldq      xmm6,       8

        paddw       xmm0,       xmm6
        movd        [rcx+4],    xmm0
;-
        movq        xmm0,       xmm7
        psrldq      xmm7,       8

        paddw       xmm0,       xmm7
        movd        [rcx+8],    xmm0

    STACK_FRAME_DESTROY_X3

;void vp9_sad8x16x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; 16 rows of 8 bytes: eight row pairs, accumulated in MMX registers.
; NOTE(review): no emms is issued before returning; presumably the caller
; clears the MMX state - confirm against the call sites.
global sym(vp9_sad8x16x3_sse3) PRIVATE
sym(vp9_sad8x16x3_sse3):

    STACK_FRAME_CREATE_X3

        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov         rcx,        result_ptr

        ; Pack the +0 and +1 sums into one qword: results[0], results[1].
        punpckldq   mm5,        mm6

        movq        [rcx],      mm5
        movd        [rcx+8],    mm7             ; results[2]

    STACK_FRAME_DESTROY_X3

;void vp9_sad8x8x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; 8 rows of 8 bytes: four row pairs, accumulated in MMX registers.
; NOTE(review): no emms is issued before returning; presumably the caller
; clears the MMX state - confirm against the call sites.
global sym(vp9_sad8x8x3_sse3) PRIVATE
sym(vp9_sad8x8x3_sse3):

    STACK_FRAME_CREATE_X3

        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride

        mov         rcx,        result_ptr

        ; Pack the +0 and +1 sums into one qword: results[0], results[1].
        punpckldq   mm5,        mm6

        movq        [rcx],      mm5
        movd        [rcx+8],    mm7             ; results[2]

    STACK_FRAME_DESTROY_X3

;void vp9_sad4x4x3_sse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; 4 rows of 4 bytes. Two 4-byte rows are interleaved (punpcklbw) into one
; 8-byte MMX register so a single psadbw covers both rows; source and
; reference are interleaved the same way, so bytes still pair up.
; NOTE(review): no emms is issued before returning; presumably the caller
; clears the MMX state - confirm against the call sites.
global sym(vp9_sad4x4x3_sse3) PRIVATE
sym(vp9_sad4x4x3_sse3):

    STACK_FRAME_CREATE_X3

        ; ---- rows 0 and 1 ----
        movd        mm0,        DWORD PTR [src_ptr]
        movd        mm1,        DWORD PTR [ref_ptr]

        movd        mm2,        DWORD PTR [src_ptr+src_stride]
        movd        mm3,        DWORD PTR [ref_ptr+ref_stride]

        punpcklbw   mm0,        mm2             ; mm0 = src rows 0/1
        punpcklbw   mm1,        mm3             ; mm1 = ref rows 0/1, offset +0

        movd        mm4,        DWORD PTR [ref_ptr+1]
        movd        mm5,        DWORD PTR [ref_ptr+2]

        movd        mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        movd        mm3,        DWORD PTR [ref_ptr+ref_stride+2]

        psadbw      mm1,        mm0             ; mm1 = SAD(+0), rows 0/1

        punpcklbw   mm4,        mm2             ; ref rows 0/1, offset +1
        punpcklbw   mm5,        mm3             ; ref rows 0/1, offset +2

        psadbw      mm4,        mm0             ; mm4 = SAD(+1), rows 0/1
        psadbw      mm5,        mm0             ; mm5 = SAD(+2), rows 0/1

        lea         src_ptr,    [src_ptr+src_stride*2]
        lea         ref_ptr,    [ref_ptr+ref_stride*2]

        ; ---- rows 2 and 3 ----
        movd        mm0,        DWORD PTR [src_ptr]
        movd        mm2,        DWORD PTR [ref_ptr]

        movd        mm3,        DWORD PTR [src_ptr+src_stride]
        movd        mm6,        DWORD PTR [ref_ptr+ref_stride]

        punpcklbw   mm0,        mm3             ; mm0 = src rows 2/3
        punpcklbw   mm2,        mm6             ; mm2 = ref rows 2/3, offset +0

        movd        mm3,        DWORD PTR [ref_ptr+1]
        movd        mm7,        DWORD PTR [ref_ptr+2]

        psadbw      mm2,        mm0

        paddw       mm1,        mm2             ; mm1 = total SAD(+0)

        movd        mm2,        DWORD PTR [ref_ptr+ref_stride+1]
        movd        mm6,        DWORD PTR [ref_ptr+ref_stride+2]

        punpcklbw   mm3,        mm2             ; ref rows 2/3, offset +1
        punpcklbw   mm7,        mm6             ; ref rows 2/3, offset +2

        psadbw      mm3,        mm0
        psadbw      mm7,        mm0

        paddw       mm3,        mm4             ; mm3 = total SAD(+1)
        paddw       mm7,        mm5             ; mm7 = total SAD(+2)

        mov         rcx,        result_ptr

        ; Pack the +0 and +1 sums into one qword: results[0], results[1].
        punpckldq   mm1,        mm3

        movq        [rcx],      mm1
        movd        [rcx+8],    mm7             ; results[2]

    STACK_FRAME_DESTROY_X3