;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; Overview
;
; Both entry points compute three sums of absolute differences (SAD) between
; one 16-pixel-wide source block and the reference block at byte offsets
; +0, +1 and +2 from ref_ptr.
;
; Register roles inside the row-processing macros:
;   rsi        current source row             rax   source stride
;   rdi        current reference row          rdx   reference stride
;   xmm0       the 16 source bytes of the row being processed
;   xmm1-xmm4  scratch
;   xmm5-xmm7  running SAD totals for reference offsets +0 / +1 / +2
;
; NOTE(review): sym(), arg(), SHADOW_ARGS_TO_STACK, SAVE_XMM, RESTORE_XMM,
; UNSHADOW_ARGS, XMMWORD, PTR and PRIVATE are not defined in this file;
; presumably they come from vpx_ports/x86_abi_support.asm.
;-----------------------------------------------------------------------------

;-----------------------------------------------------------------------------
; SAD3_LOAD_UNALIGNED r0, r1, r2, addr
;   Unaligned 16-byte loads of [addr], [addr+1] and [addr+2] into r0, r1, r2.
;-----------------------------------------------------------------------------
%macro SAD3_LOAD_UNALIGNED 4
        lddqu           %1,         XMMWORD PTR [%4]
        lddqu           %2,         XMMWORD PTR [%4+1]
        lddqu           %3,         XMMWORD PTR [%4+2]
%endmacro

;-----------------------------------------------------------------------------
; SAD3_LOAD_SHIFTED r0, r1, r2, shift
;   In:  xmm4 = lower 16 bytes, r2 = upper 16 bytes of a 32-byte span.
;   Out: r0, r1, r2 = the 16-byte windows of that span which start at byte
;        shift, shift+1 and shift+2 (r2 is overwritten last).
;-----------------------------------------------------------------------------
%macro SAD3_LOAD_SHIFTED 4
        movdqa          %1,         %3
        palignr         %1,         xmm4,       %4

        movdqa          %2,         %3
        palignr         %2,         xmm4,       (%4+1)

        palignr         %3,         xmm4,       (%4+2)
%endmacro

;-----------------------------------------------------------------------------
; SAD3_DIFF r0, r1, r2
;   Replace each register by its psadbw result against the source row (xmm0).
;-----------------------------------------------------------------------------
%macro SAD3_DIFF 3
        psadbw          %1,         xmm0
        psadbw          %2,         xmm0
        psadbw          %3,         xmm0
%endmacro

;-----------------------------------------------------------------------------
; SAD3_ACCUMULATE r0, r1, r2
;   Add the per-row results r0/r1/r2 into the running totals xmm5/xmm6/xmm7.
;-----------------------------------------------------------------------------
%macro SAD3_ACCUMULATE 3
        paddw           xmm5,       %1
        paddw           xmm6,       %2
        paddw           xmm7,       %3
%endmacro

;-----------------------------------------------------------------------------
; PROCESS_16X2X3 first
;   Handle two rows using unaligned reference loads.
;   first != 0 : the first row initialises xmm5-xmm7
;   first == 0 : the first row is added to xmm5-xmm7
;   The second row is always added.  rsi and rdi advance by two strides.
;   The source rows are read with movdqa, so they must be 16-byte aligned.
;   Clobbers xmm0-xmm3.
;-----------------------------------------------------------------------------
%macro PROCESS_16X2X3 1
        ; row 0
        movdqa          xmm0,       XMMWORD PTR [rsi]
%if %1
        SAD3_LOAD_UNALIGNED xmm5, xmm6, xmm7, rdi
        SAD3_DIFF           xmm5, xmm6, xmm7
%else
        SAD3_LOAD_UNALIGNED xmm1, xmm2, xmm3, rdi
        SAD3_DIFF           xmm1, xmm2, xmm3
        SAD3_ACCUMULATE     xmm1, xmm2, xmm3
%endif
        ; row 1
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        SAD3_LOAD_UNALIGNED xmm1, xmm2, xmm3, rdi+rdx

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        SAD3_DIFF           xmm1, xmm2, xmm3
        SAD3_ACCUMULATE     xmm1, xmm2, xmm3
%endmacro

;-----------------------------------------------------------------------------
; PROCESS_16X2X3_OFFSET first, shift
;   Same job as PROCESS_16X2X3, but the reference row is fetched as two
;   aligned 16-byte loads ([rdi] and [rdi+16]) and the three windows are
;   extracted with palignr at byte positions shift, shift+1 and shift+2.
;   rdi must already be 16-byte aligned here (the callers below subtract
;   the misalignment before the first invocation).
;   NOTE(review): the misalignment is measured once per call, so every
;   reference row is assumed to share it (ref_stride a multiple of 16).
;   Clobbers xmm0-xmm4.
;-----------------------------------------------------------------------------
%macro PROCESS_16X2X3_OFFSET 2
        ; row 0
        movdqa          xmm0,       XMMWORD PTR [rsi]
        movdqa          xmm4,       XMMWORD PTR [rdi]
%if %1
        movdqa          xmm7,       XMMWORD PTR [rdi+16]
        SAD3_LOAD_SHIFTED   xmm5, xmm6, xmm7, %2
        SAD3_DIFF           xmm5, xmm6, xmm7
%else
        movdqa          xmm3,       XMMWORD PTR [rdi+16]
        SAD3_LOAD_SHIFTED   xmm1, xmm2, xmm3, %2
        SAD3_DIFF           xmm1, xmm2, xmm3
        SAD3_ACCUMULATE     xmm1, xmm2, xmm3
%endif
        ; row 1
        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]
        SAD3_LOAD_SHIFTED   xmm1, xmm2, xmm3, %2

        lea             rsi,        [rsi+rax*2]
        lea             rdi,        [rdi+rdx*2]

        SAD3_DIFF           xmm1, xmm2, xmm3
        SAD3_ACCUMULATE     xmm1, xmm2, xmm3
%endmacro

;-----------------------------------------------------------------------------
; PROCESS_16X16X3_OFFSET misalign, label_prefix
;   Emits the jump-table target <label_prefix>_aligned_by_<misalign>:
;   rewinds rdi by the misalignment, processes 16 rows (8 row pairs) and
;   jumps to <label_prefix>_store_off.
;-----------------------------------------------------------------------------
%macro PROCESS_16X16X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1
%rep 7
        PROCESS_16X2X3_OFFSET 0, %1
%endrep

        jmp             %2_store_off

%endmacro

;-----------------------------------------------------------------------------
; PROCESS_16X8X3_OFFSET misalign, label_prefix
;   8-row (4 row pairs) counterpart of PROCESS_16X16X3_OFFSET.
;-----------------------------------------------------------------------------
%macro PROCESS_16X8X3_OFFSET 2
%2_aligned_by_%1:

        sub             rdi,        %1

        PROCESS_16X2X3_OFFSET 1, %1
%rep 3
        PROCESS_16X2X3_OFFSET 0, %1
%endrep

        jmp             %2_store_off

%endmacro

;-----------------------------------------------------------------------------
; SAD3_DISPATCH label_prefix
;   In:  rdx = ref_ptr & 15 (index into the jump table).
;   Jumps to <label_prefix>_aligned_by_<rdx>.  The table stores 32-bit
;   offsets relative to <label_prefix>_do_jump, whose run-time address is
;   obtained with a call/pop pair, so no absolute addresses are needed.
;   Out (at the jump target): rax = src_stride, rdx = ref_stride, both
;   sign-extended from the 32-bit arguments.  Clobbers rcx.
;   The sixteen <label_prefix>_aligned_by_N labels must be defined by the
;   code that follows the macro invocation.
;-----------------------------------------------------------------------------
%macro SAD3_DISPATCH 1
        jmp             %1_skiptable
%1_jumptable:
        dd %1_aligned_by_0  - %1_do_jump
        dd %1_aligned_by_1  - %1_do_jump
        dd %1_aligned_by_2  - %1_do_jump
        dd %1_aligned_by_3  - %1_do_jump
        dd %1_aligned_by_4  - %1_do_jump
        dd %1_aligned_by_5  - %1_do_jump
        dd %1_aligned_by_6  - %1_do_jump
        dd %1_aligned_by_7  - %1_do_jump
        dd %1_aligned_by_8  - %1_do_jump
        dd %1_aligned_by_9  - %1_do_jump
        dd %1_aligned_by_10 - %1_do_jump
        dd %1_aligned_by_11 - %1_do_jump
        dd %1_aligned_by_12 - %1_do_jump
        dd %1_aligned_by_13 - %1_do_jump
        dd %1_aligned_by_14 - %1_do_jump
        dd %1_aligned_by_15 - %1_do_jump
%1_skiptable:

        call            %1_do_jump
%1_do_jump:
        pop             rcx                         ; rcx = address of do_jump
        mov             rax,        %1_jumptable - %1_do_jump
        add             rax,        rcx             ; rax = address of the jump table

        movsxd          rax,        dword [rax + 4*rdx] ; 32-bit offset of the selected target
        add             rcx,        rax             ; rcx = address of the selected target

        movsxd          rax,        dword ptr arg(1) ;src_stride
        movsxd          rdx,        dword ptr arg(3) ;ref_stride

        jmp             rcx
%endmacro

;-----------------------------------------------------------------------------
; SAD3_STORE_ONE acc, addr
;   Add the two 64-bit halves of accumulator acc and store the low 32 bits
;   of the sum at [addr].  Clobbers xmm0 and acc.
;-----------------------------------------------------------------------------
%macro SAD3_STORE_ONE 2
        movq            xmm0,       %1
        psrldq          %1,         8

        paddw           xmm0,       %1
        movd            [%2],       xmm0
%endmacro

;-----------------------------------------------------------------------------
; SAD3_STORE_RESULTS
;   Write the three totals to the results array passed as argument 4:
;   results[0] <- xmm5 (offset +0), results[1] <- xmm6 (offset +1),
;   results[2] <- xmm7 (offset +2).  Clobbers rdi, xmm0, xmm5-xmm7.
;-----------------------------------------------------------------------------
%macro SAD3_STORE_RESULTS 0
        mov             rdi,        arg(4)          ;Results

        SAD3_STORE_ONE  xmm5,       rdi
;-
        SAD3_STORE_ONE  xmm6,       rdi+4
;-
        SAD3_STORE_ONE  xmm7,       rdi+8
%endmacro

;-----------------------------------------------------------------------------
;void vp8_sad16x16x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; 16x16 block: results[i] receives the SAD of the source block against the
; reference block starting at ref_ptr + i, for i = 0, 1, 2.
; The code path is selected by (ref_ptr & 15): misalignments 0..14 use
; aligned loads plus palignr, misalignment 15 uses unaligned loads.
;-----------------------------------------------------------------------------
global sym(vp8_sad16x16x3_ssse3) PRIVATE
sym(vp8_sad16x16x3_ssse3):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        SAVE_XMM 7
        push            rsi
        push            rdi
        push            rcx
        ; end prolog

        mov             rsi,        arg(0)          ;src_ptr
        mov             rdi,        arg(2)          ;ref_ptr

        mov             rdx,        0xf
        and             rdx,        rdi             ; rdx = ref_ptr & 15

        SAD3_DISPATCH   .vp8_sad16x16x3_ssse3

        ; jump-table targets for misalignments 0..14
%assign sad3_misalign 0
%rep 15
        PROCESS_16X16X3_OFFSET sad3_misalign, .vp8_sad16x16x3_ssse3
%assign sad3_misalign sad3_misalign+1
%endrep

        ; misalignment 15: shift+2 would reach past the 32 loaded bytes,
        ; so this case uses unaligned loads instead
.vp8_sad16x16x3_ssse3_aligned_by_15:
        PROCESS_16X2X3 1
%rep 7
        PROCESS_16X2X3 0
%endrep

.vp8_sad16x16x3_ssse3_store_off:
        SAD3_STORE_RESULTS

        ; begin epilog
        pop             rcx
        pop             rdi
        pop             rsi
        RESTORE_XMM
        UNSHADOW_ARGS
        pop             rbp
        ret

;-----------------------------------------------------------------------------
;void vp8_sad16x8x3_ssse3(
;    unsigned char *src_ptr,
;    int  src_stride,
;    unsigned char *ref_ptr,
;    int  ref_stride,
;    int  *results)
;
; 16x8 block: same contract and dispatch scheme as vp8_sad16x16x3_ssse3,
; but only 8 rows (4 row pairs) are processed.
;-----------------------------------------------------------------------------
global sym(vp8_sad16x8x3_ssse3) PRIVATE
sym(vp8_sad16x8x3_ssse3):
        push            rbp
        mov             rbp,        rsp
        SHADOW_ARGS_TO_STACK 5
        SAVE_XMM 7
        push            rsi
        push            rdi
        push            rcx
        ; end prolog

        mov             rsi,        arg(0)          ;src_ptr
        mov             rdi,        arg(2)          ;ref_ptr

        mov             rdx,        0xf
        and             rdx,        rdi             ; rdx = ref_ptr & 15

        SAD3_DISPATCH   .vp8_sad16x8x3_ssse3

        ; jump-table targets for misalignments 0..14
%assign sad3_misalign 0
%rep 15
        PROCESS_16X8X3_OFFSET sad3_misalign, .vp8_sad16x8x3_ssse3
%assign sad3_misalign sad3_misalign+1
%endrep

        ; misalignment 15: handled with unaligned loads
.vp8_sad16x8x3_ssse3_aligned_by_15:

        PROCESS_16X2X3 1
%rep 3
        PROCESS_16X2X3 0
%endrep

.vp8_sad16x8x3_ssse3_store_off:
        SAD3_STORE_RESULTS

        ; begin epilog
        pop             rcx
        pop             rdi
        pop             rsi
        RESTORE_XMM
        UNSHADOW_ARGS
        pop             rbp
        ret