;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; File: media/libvpx/vp8/encoder/x86/subtract_mmx.asm
;
; MMX residual ("diff = source - prediction") kernels. Every routine widens
; unsigned bytes to 16-bit words by interleaving with a zeroed register and
; then subtracts word-wise, so results are signed 16-bit differences.
;
; NOTE(review): none of these routines executes emms before returning;
; presumably the callers are responsible for clearing MMX state - confirm.

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; SUBTRACT_ROW4 src, pred, dst
;   Loads 4 bytes from src and 4 bytes from pred, widens both to words and
;   stores the four 16-bit differences (src - pred) as one quadword at dst.
;   Expects: mm7 == 0.   Clobbers: mm0, mm1.
;-----------------------------------------------------------------------------
%macro SUBTRACT_ROW4 3
    movd        mm1, %2                     ; 4 predictor bytes
    movd        mm0, %1                     ; 4 source bytes
    punpcklbw   mm1, mm7                    ; bytes -> words
    punpcklbw   mm0, mm7
    psubw       mm0, mm1                    ; src - pred
    movq        %3, mm0
%endmacro

;-----------------------------------------------------------------------------
; SUBTRACT_ROW8 src, pred, dst_lo, dst_hi
;   Loads 8 bytes from src and 8 bytes from pred, widens both to words and
;   stores src - pred: pixels 0-3 as a quadword at dst_lo, pixels 4-7 as a
;   quadword at dst_hi.
;   Expects: mm7 == 0.   Clobbers: mm0, mm1, mm2, mm3.
;-----------------------------------------------------------------------------
%macro SUBTRACT_ROW8 4
    movq        mm0, %1                     ; 8 source bytes
    movq        mm2, %2                     ; 8 predictor bytes
    movq        mm1, mm0
    movq        mm3, mm2
    punpcklbw   mm0, mm7                    ; source pixels 0-3 as words
    punpckhbw   mm1, mm7                    ; source pixels 4-7 as words
    punpcklbw   mm2, mm7                    ; predictor pixels 0-3 as words
    punpckhbw   mm3, mm7                    ; predictor pixels 4-7 as words
    psubw       mm0, mm2
    psubw       mm1, mm3
    movq        %3, mm0
    movq        %4, mm1
%endmacro

;-----------------------------------------------------------------------------
;void vp8_subtract_b_mmx_impl(unsigned char *z,  int src_stride,
;                             short *diff, unsigned char *Predictor,
;                             int pitch);
;
; 4x4 block residual. For each row r in 0..3 and column c in 0..3:
;   diff[r*pitch + c] = z[r*src_stride + c] - Predictor[r*pitch + c]
; pitch is used both as the predictor stride (in bytes) and as the diff
; stride (in 16-bit elements).
;-----------------------------------------------------------------------------
global sym(vp8_subtract_b_mmx_impl) PRIVATE
sym(vp8_subtract_b_mmx_impl):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)                 ; z
    movsxd      rdx, dword ptr arg(1)       ; src_stride
    mov         rdi, arg(2)                 ; diff
    mov         rax, arg(3)                 ; Predictor
    movsxd      rcx, dword ptr arg(4)       ; pitch
    pxor        mm7, mm7                    ; zero, used for byte->word unpack

    ; rows 0, 1 and 2 are reachable with plain scaled addressing
    SUBTRACT_ROW4 [rsi],       [rax],       [rdi]
    SUBTRACT_ROW4 [rsi+rdx],   [rax+rcx],   [rdi+rcx*2]
    SUBTRACT_ROW4 [rsi+rdx*2], [rax+rcx*2], [rdi+rcx*4]

    ; row 3: rcx becomes 3*pitch, rsi is advanced two source rows so that
    ; [rsi+rdx] addresses source row 3
    lea         rcx, [rcx+rcx*2]
    lea         rsi, [rsi+rdx*2]
    SUBTRACT_ROW4 [rsi+rdx],   [rax+rcx],   [rdi+rcx*2]

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;void vp8_subtract_mby_mmx(short *diff, unsigned char *src, int src_stride,
;unsigned char *pred, int pred_stride)
;
; 16x16 residual. Processes 16 rows of 16 pixels; each row writes 16
; consecutive 16-bit differences (32 bytes), so diff is filled contiguously.
; src advances by src_stride and pred by pred_stride after every row.
;-----------------------------------------------------------------------------
global sym(vp8_subtract_mby_mmx) PRIVATE
sym(vp8_subtract_mby_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    push        rsi
    push        rdi
    push        rbx                         ; rbx holds pred_stride below
    ; end prolog

    mov         rdi, arg(0)                 ; diff
    mov         rsi, arg(1)                 ; src
    mov         rax, arg(3)                 ; pred
    movsxd      rdx, dword ptr arg(2)       ; src_stride
    movsxd      rbx, dword ptr arg(4)       ; pred_stride

    pxor        mm7, mm7                    ; zero, used for byte->word unpack
    mov         rcx, 16                     ; rows remaining

.next_y_row:
    SUBTRACT_ROW8 [rsi],   [rax],   [rdi],    [rdi+8]     ; pixels 0-7
    SUBTRACT_ROW8 [rsi+8], [rax+8], [rdi+16], [rdi+24]    ; pixels 8-15

    add         rdi, 32                     ; 16 words written per row
    add         rax, rbx
    add         rsi, rdx
    dec         rcx
    jnz         .next_y_row

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------------
;vp8_subtract_mbuv_mmx(short *diff, unsigned char *usrc, unsigned char *vsrc,
;                         int src_stride, unsigned char *upred,
;                         unsigned char *vpred, int pred_stride)
;
; Two 8x8 residuals written back to back, starting 256 16-bit elements into
; diff: first 8 rows of usrc - upred, then 8 rows of vsrc - vpred. Each row
; writes 8 consecutive 16-bit differences (16 bytes). Both planes use the
; same src_stride and the same pred_stride.
;-----------------------------------------------------------------------------
global sym(vp8_subtract_mbuv_mmx) PRIVATE
sym(vp8_subtract_mbuv_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    push        rsi
    push        rdi
    push        rbx                         ; rbx holds pred_stride below
    ; end prolog

    mov         rdi, arg(0)                 ; diff
    add         rdi, 256*2                  ; diff = diff + 256 (shorts)
    movsxd      rdx, dword ptr arg(3)       ; src_stride
    movsxd      rbx, dword ptr arg(6)       ; pred_stride
    pxor        mm7, mm7                    ; zero, used for byte->word unpack

    ; ---- U plane ----
    mov         rsi, arg(1)                 ; usrc
    mov         rax, arg(4)                 ; upred
    mov         rcx, 8                      ; rows remaining

.next_u_row:
    SUBTRACT_ROW8 [rsi], [rax], [rdi], [rdi+8]

    add         rdi, 16                     ; 8 words written per row
    add         rsi, rdx
    add         rax, rbx
    dec         rcx
    jnz         .next_u_row

    ; ---- V plane ---- (rdi carries on directly after the U output)
    mov         rsi, arg(2)                 ; vsrc
    mov         rax, arg(5)                 ; vpred
    mov         rcx, 8                      ; rows remaining

.next_v_row:
    SUBTRACT_ROW8 [rsi], [rax], [rdi], [rdi+8]

    add         rdi, 16
    add         rsi, rdx
    add         rax, rbx
    dec         rcx
    jnz         .next_v_row

    ; begin epilog
    pop         rbx
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret