;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Syntax: Intel operand order (NASM/Yasm style).
; sym(), arg(), GLOBAL(), PRIVATE, SHADOW_ARGS_TO_STACK, UNSHADOW_ARGS,
; GET_GOT, RESTORE_GOT and SECTION_RODATA are not defined in this file;
; they come from the include below.

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; void vp8_subtract_b_sse2_impl(unsigned char *z, int src_stride,
;                               short *diff, unsigned char *Predictor,
;                               int pitch);
;
; Writes diff = z - Predictor for a block of 4 rows x 4 pixels.
; Each row loads 4 bytes from z and 4 bytes from Predictor, widens both
; to 16-bit words by interleaving with a zero register, subtracts, and
; stores 4 shorts (8 bytes).
;
; Row spacing:
;   z          rows are src_stride bytes apart
;   Predictor  rows are pitch bytes apart
;   diff       rows are pitch shorts (pitch*2 bytes) apart
;
; Register roles:
;   rsi = z            rdx = src_stride
;   rax = Predictor    rcx = pitch (becomes 3*pitch before the last row)
;   rdi = diff         mm7 = 0, mm0/mm1 = scratch
;
; NOTE(review): despite the _sse2 suffix only MMX registers are used, and
; no emms is executed here. Presumably callers clear the MMX state
; themselves -- confirm before calling from code that uses x87.
;-----------------------------------------------------------------------
global sym(vp8_subtract_b_sse2_impl) PRIVATE
sym(vp8_subtract_b_sse2_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx                     ; no GLOBAL() reference follows in this routine
    push        rsi
    push        rdi
    ; end prolog

        mov         rdi,        arg(2)              ; diff
        mov         rax,        arg(3)              ; Predictor
        mov         rsi,        arg(0)              ; z
        movsxd      rdx,        dword ptr arg(1)    ; src_stride (sign-extended)
        movsxd      rcx,        dword ptr arg(4)    ; pitch      (sign-extended)
        pxor        mm7,        mm7                 ; mm7 = 0, used to widen bytes

        ; row 0
        movd        mm0,        [rsi]
        movd        mm1,        [rax]
        punpcklbw   mm0,        mm7                 ; 4 x u8 -> 4 x u16
        punpcklbw   mm1,        mm7
        psubw       mm0,        mm1                 ; z - Predictor
        movq        MMWORD PTR [rdi], mm0

        ; row 1
        movd        mm0,        [rsi+rdx]
        movd        mm1,        [rax+rcx]
        punpcklbw   mm0,        mm7
        punpcklbw   mm1,        mm7
        psubw       mm0,        mm1
        movq        MMWORD PTR [rdi+rcx*2], mm0     ; diff + 1*pitch shorts

        ; row 2
        movd        mm0,        [rsi+rdx*2]
        movd        mm1,        [rax+rcx*2]
        punpcklbw   mm0,        mm7
        punpcklbw   mm1,        mm7
        psubw       mm0,        mm1
        movq        MMWORD PTR [rdi+rcx*4], mm0     ; diff + 2*pitch shorts

        ; row 3: x86 addressing has no *3 scale, so advance z by two rows
        ; and turn rcx into 3*pitch.
        lea         rsi,        [rsi+rdx*2]         ; rsi = z + 2*src_stride
        lea         rcx,        [rcx+rcx*2]         ; rcx = 3*pitch

        movd        mm0,        [rsi+rdx]           ; z + 3*src_stride
        movd        mm1,        [rax+rcx]           ; Predictor + 3*pitch
        punpcklbw   mm0,        mm7
        punpcklbw   mm1,        mm7
        psubw       mm0,        mm1
        movq        MMWORD PTR [rdi+rcx*2], mm0     ; diff + 3*pitch shorts

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
; void vp8_subtract_mby_sse2(short *diff, unsigned char *src, int src_stride,
;                            unsigned char *pred, int pred_stride)
;
; Writes diff = src - pred for 16 rows of 16 pixels. The loop runs 8 times
; and handles two rows per iteration. Output is contiguous: each row
; produces 16 shorts (32 bytes), 64 bytes per iteration.
;
; Every load and store is movdqa, so each src row, each pred row and diff
; must be 16-byte aligned.
;
; Widening trick (per 16 pixels):
;   psubb        -> low 8 bits of (src - pred)
;   pxor t80     -> flips the sign bit so signed pcmpgtb orders the bytes
;                   as unsigned values
;   pcmpgtb      -> 0xFF where pred > src (difference negative), else 0x00
;   punpckl/hbw  -> interleaves low byte with that mask as the high byte,
;                   giving the sign-extended 16-bit difference
;
; Register roles:
;   rdi = diff (advances 64 bytes per iteration)
;   rsi = src,  rdx = src_stride
;   rax = pred, rbx = pred_stride
;   rcx = remaining row pairs
;   xmm4 = t80 (0x80 in every byte), xmm0-xmm3/xmm5 = scratch
;-----------------------------------------------------------------------
global sym(vp8_subtract_mby_sse2) PRIVATE
sym(vp8_subtract_mby_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rdi,        arg(0)              ; diff
        mov         rsi,        arg(1)              ; src
        movsxd      rdx,        dword ptr arg(2)    ; src_stride
        mov         rax,        arg(3)              ; pred
        ; Load t80 before rbx is repurposed below; GLOBAL() may depend on
        ; the register set up by GET_GOT rbx (see x86_abi_support.asm).
        movdqa      xmm4,       [GLOBAL(t80)]
        push        rbx                             ; rbx is reused as pred_stride
        mov         rcx,        8                   ; do two lines at one time
        movsxd      rbx,        dword ptr arg(4)    ; pred_stride

.submby_loop:
        ; ---- first row of the pair ----
        movdqa      xmm0,       [rsi]               ; src
        movdqa      xmm1,       [rax]               ; pred

        movdqa      xmm2,       xmm0
        psubb       xmm0,       xmm1                ; low byte of src - pred

        pxor        xmm1,       xmm4                ; convert to signed values
        pxor        xmm2,       xmm4
        pcmpgtb     xmm1,       xmm2                ; 0xFF where pred > src

        movdqa      xmm2,       xmm0
        punpcklbw   xmm0,       xmm1                ; pixels 0..7  -> s16
        punpckhbw   xmm2,       xmm1                ; pixels 8..15 -> s16

        ; ---- second row of the pair (loads issued before pointers advance) ----
        movdqa      xmm3,       [rsi + rdx]
        movdqa      xmm5,       [rax + rbx]

        lea         rsi,        [rsi+rdx*2]         ; src  += 2 rows
        lea         rax,        [rax+rbx*2]         ; pred += 2 rows

        movdqa      [rdi],      xmm0
        movdqa      [rdi +16],  xmm2

        movdqa      xmm1,       xmm3
        psubb       xmm3,       xmm5                ; low byte of src - pred

        pxor        xmm5,       xmm4                ; convert to signed values
        pxor        xmm1,       xmm4
        pcmpgtb     xmm5,       xmm1                ; 0xFF where pred > src

        movdqa      xmm1,       xmm3
        punpcklbw   xmm3,       xmm5                ; pixels 0..7  -> s16
        punpckhbw   xmm1,       xmm5                ; pixels 8..15 -> s16

        movdqa      [rdi +32],  xmm3
        movdqa      [rdi +48],  xmm1

        add         rdi,        64                  ; two rows x 16 shorts
        dec         rcx
        jnz         .submby_loop

    ; begin epilog
    pop         rbx                                 ; matches the push before the loop
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
; vp8_subtract_mbuv_sse2(short *diff, unsigned char *usrc, unsigned char *vsrc,
;                        int src_stride, unsigned char *upred,
;                        unsigned char *vpred, int pred_stride)
;
; Writes usrc - upred and then vsrc - vpred, each 8 rows of 8 pixels, into
; diff starting at offset 256 shorts (512 bytes). The U result occupies the
; first 64 shorts from there; the V result follows immediately, because
; rdi is not reset between the two loops.
;
; Each loop runs 4 times and packs two 8-pixel rows into one xmm register
; (low qword = row n, high qword = row n+1). Sign extension uses the same
; psubb / pcmpgtb / punpck sequence as vp8_subtract_mby_sse2.
;
; Source and predictor rows are read with movq (no alignment requirement);
; the output is stored with movdqa, so diff must be 16-byte aligned.
;
; Register roles:
;   rdi = output cursor (advances 32 bytes per iteration)
;   rsi = usrc then vsrc,   rdx = src_stride  (shared by U and V)
;   rax = upred then vpred, rbx = pred_stride (shared by U and V)
;   rcx = remaining row pairs
;   xmm4 = t80 (0x80 in every byte), xmm0-xmm3 = scratch
;-----------------------------------------------------------------------
global sym(vp8_subtract_mbuv_sse2) PRIVATE
sym(vp8_subtract_mbuv_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        ; Load t80 before rbx is repurposed below; GLOBAL() may depend on
        ; the register set up by GET_GOT rbx (see x86_abi_support.asm).
        movdqa      xmm4,       [GLOBAL(t80)]
        mov         rdi,        arg(0)              ; diff
        mov         rsi,        arg(1)              ; usrc
        movsxd      rdx,        dword ptr arg(3)    ; src_stride
        mov         rax,        arg(4)              ; upred
        add         rdi,        256*2               ; diff = diff + 256 (shorts)
        mov         rcx,        4                   ; 4 row pairs = 8 rows
        push        rbx                             ; rbx is reused as pred_stride
        movsxd      rbx,        dword ptr arg(6)    ; pred_stride

        ;u
.submbu_loop:
        movq        xmm0,       [rsi]               ; src
        movq        xmm2,       [rsi+rdx]           ; src -- next line
        movq        xmm1,       [rax]               ; pred
        movq        xmm3,       [rax+rbx]           ; pred -- next line
        lea         rsi,        [rsi + rdx*2]
        lea         rax,        [rax + rbx*2]

        punpcklqdq  xmm0,       xmm2                ; xmm0 = src  rows n, n+1
        punpcklqdq  xmm1,       xmm3                ; xmm1 = pred rows n, n+1

        movdqa      xmm2,       xmm0
        psubb       xmm0,       xmm1                ; subtraction with sign missed

        pxor        xmm1,       xmm4                ; convert to signed values
        pxor        xmm2,       xmm4
        pcmpgtb     xmm1,       xmm2                ; 0xFF where pred > src

        ; punpcklbw leaves its source untouched, so the sign mask in xmm1
        ; serves both unpacks without a copy.
        movdqa      xmm2,       xmm0
        punpcklbw   xmm0,       xmm1                ; row n   -> s16
        punpckhbw   xmm2,       xmm1                ; row n+1 -> s16

        movdqa      [rdi],      xmm0                ; store difference
        movdqa      [rdi +16],  xmm2                ; store difference
        add         rdi,        32                  ; two rows x 8 shorts
        sub         rcx,        1
        jnz         .submbu_loop

        ; Switch to the V plane; rdi carries on right after the U output.
        mov         rsi,        arg(2)              ; vsrc
        mov         rax,        arg(5)              ; vpred
        mov         rcx,        4

        ;v
.submbv_loop:
        movq        xmm0,       [rsi]               ; src
        movq        xmm2,       [rsi+rdx]           ; src -- next line
        movq        xmm1,       [rax]               ; pred
        movq        xmm3,       [rax+rbx]           ; pred -- next line
        lea         rsi,        [rsi + rdx*2]
        lea         rax,        [rax + rbx*2]

        punpcklqdq  xmm0,       xmm2                ; xmm0 = src  rows n, n+1
        punpcklqdq  xmm1,       xmm3                ; xmm1 = pred rows n, n+1

        movdqa      xmm2,       xmm0
        psubb       xmm0,       xmm1                ; subtraction with sign missed

        pxor        xmm1,       xmm4                ; convert to signed values
        pxor        xmm2,       xmm4
        pcmpgtb     xmm1,       xmm2                ; 0xFF where pred > src

        movdqa      xmm2,       xmm0
        punpcklbw   xmm0,       xmm1                ; row n   -> s16
        punpckhbw   xmm2,       xmm1                ; row n+1 -> s16

        movdqa      [rdi],      xmm0                ; store difference
        movdqa      [rdi +16],  xmm2                ; store difference
        add         rdi,        32                  ; two rows x 8 shorts
        sub         rcx,        1
        jnz         .submbv_loop

    pop         rbx                                 ; matches the push before the U loop
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
align 16
; 0x80 in every byte. XORing pixel bytes with it flips the sign bit, so the
; signed byte compare (pcmpgtb) orders them as unsigned values.
t80:
    times 16 db 0x80