;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; File: media/libvpx/vp8/common/x86/postproc_mmx.asm
;
; Syntax: Intel operand order (NASM/Yasm style).  Register names, arg(),
; sym(), GLOBAL() and the prolog/epilog macros all come from
; x86_abi_support.asm, which lets the same text build for 32- and 64-bit
; targets.


%include "vpx_ports/x86_abi_support.asm"

%define VP8_FILTER_WEIGHT 128
%define VP8_FILTER_SHIFT  7

;-----------------------------------------------------------------------
;void vp8_mbpost_proc_down_mmx(unsigned char *dst,
;                             int pitch, int rows, int cols,int flimit)
;
; Vertical smoothing pass, processed in strips of 4 pixel columns.
; For each pixel a running sum and sum of squares over a 15-row window
; (rows r-8 .. r+6 before the per-row update, r-7 .. r+7 after it) is
; maintained.  Where
;       15*sumsq - sum*sum - flimit < 0
; the pixel is replaced by (pixel + sum + vp8_rv[...]) >> 4, otherwise
; it is left unchanged.
;
; Side effects visible in this routine:
;   * 8 rows above the first row and 8 rows below the last row of every
;     strip are written (8 bytes wide) as borders, so the caller's
;     buffer must have that much room around the image.
;   * the shadowed copies of dst/rows/cols are modified in place.
;   * mm0-mm7 are used and no emms is executed here.
;
; Register roles inside the row loop:
;   rax = pitch            rdx = row counter
;   rsi = s + (row-8)*pitch   (oldest row in the window, also the row
;                              that receives the delayed output)
;   rdi = s + (row+7)*pitch   (row entering the window)
;   r8  = &vp8_rv (64-bit builds only)
;   mm0 = 0, mm5 = 4 x word sum, mm6/mm7 = 4 x dword sum of squares
;-----------------------------------------------------------------------
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_mmx) PRIVATE
sym(vp8_mbpost_proc_down_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp,        136

    ; Local frame:
    ;   [rsp .. rsp+127]  ring buffer of delayed output rows, addressed
    ;                     below as [rsp + (row & 15)*4] (4 bytes per row)
    ;   [rsp+128]         flimit replicated into two dwords (flimit2)
    mov         eax,        dword ptr arg(4)    ;flimit
    mov         [rsp+128],  eax
    mov         [rsp+128+4], eax
%define flimit2 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,         [GLOBAL(sym(vp8_rv))]
%endif

    ;rows +=8;   (the output is written 8 rows late, see the row loop)
    add         dword ptr arg(2), 8

    ;for(c=0; c<cols; c+=4)
.loop_col:
    mov         rsi,        arg(0)              ;s
    pxor        mm0,        mm0                 ; mm0 = 0 for unpacking

    movsxd      rax,        dword ptr arg(1)    ;pitch

    ; this copies the last row down into the border 8 rows
    mov         rdi,        rsi
    ; rows is a 32-bit int and only its low dword was updated above, so
    ; load it as a sign-extended dword rather than reading the whole
    ; argument slot.
    movsxd      rdx,        dword ptr arg(2)
    sub         rdx,        9                   ; (rows+8) - 9 = rows - 1
    imul        rdx,        rax
    lea         rdi,        [rdi+rdx]           ; rdi = s + (rows-1)*pitch
    movq        mm1,        QWORD ptr[rdi]      ; last row
    mov         rcx,        8
.init_borderd:                                  ; initialize bottom border
    lea         rdi,        [rdi + rax]
    movq        [rdi],      mm1

    dec         rcx
    jne         .init_borderd

    neg         rax                             ; rax = -pitch

    ; this copies the first row up into the border 8 rows
    mov         rdi,        rsi
    movq        mm1,        QWORD ptr[rdi]      ; first row
    mov         rcx,        8
.init_border:                                   ; initialize top border
    lea         rdi,        [rdi + rax]
    movq        [rdi],      mm1

    dec         rcx
    jne         .init_border


    lea         rsi,        [rsi + rax*8]       ; rsi = s - pitch*8
    neg         rax                             ; rax = +pitch again


    pxor        mm5,        mm5                 ; running sum
    pxor        mm6,        mm6                 ; running sumsq, pixels 0-1

    pxor        mm7,        mm7                 ; running sumsq, pixels 2-3
    mov         rdi,        rsi

    mov         rcx,        15                  ; prime with 15 rows

    ; Accumulate rows s-8*pitch .. s+6*pitch; on exit rdi = s+7*pitch.
.loop_initvar:
    movd        mm1,        DWORD PTR [rdi]
    punpcklbw   mm1,        mm0                 ; 4 bytes -> 4 words

    paddw       mm5,        mm1                 ; sum   += x
    pmullw      mm1,        mm1                 ; x*x (fits in 16 bits)

    movq        mm2,        mm1
    punpcklwd   mm1,        mm0                 ; widen squares to dwords

    punpckhwd   mm2,        mm0
    paddd       mm6,        mm1                 ; sumsq += x*x

    paddd       mm7,        mm2
    lea         rdi,        [rdi+rax]

    dec         rcx
    jne         .loop_initvar

    xor         rdx,        rdx                 ; row counter = 0
.loop_row:
    movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8]  leaving row
    movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7]  entering row

    punpcklbw   mm1,        mm0
    punpcklbw   mm2,        mm0

    ; slide the window: add the entering row, drop the leaving row
    paddw       mm5,        mm2
    psubw       mm5,        mm1

    pmullw      mm2,        mm2
    movq        mm4,        mm2

    punpcklwd   mm2,        mm0
    punpckhwd   mm4,        mm0

    paddd       mm6,        mm2
    paddd       mm7,        mm4

    pmullw      mm1,        mm1
    movq        mm2,        mm1

    punpcklwd   mm1,        mm0
    psubd       mm6,        mm1

    punpckhwd   mm2,        mm0
    psubd       mm7,        mm2


    ; mm3/mm4 = 15*sumsq - sum*sum - flimit, then reduced to a sign mask
    movq        mm3,        mm6
    pslld       mm3,        4

    psubd       mm3,        mm6                 ; 16*sumsq - sumsq
    movq        mm1,        mm5

    movq        mm4,        mm5
    pmullw      mm1,        mm1                 ; low  16 bits of sum*sum

    pmulhw      mm4,        mm4                 ; high 16 bits of sum*sum
    movq        mm2,        mm1

    punpcklwd   mm1,        mm4                 ; 32-bit sum*sum, pixels 0-1
    punpckhwd   mm2,        mm4                 ; 32-bit sum*sum, pixels 2-3

    movq        mm4,        mm7
    pslld       mm4,        4

    psubd       mm4,        mm7

    psubd       mm3,        mm1
    psubd       mm4,        mm2

    psubd       mm3,        flimit2
    psubd       mm4,        flimit2

    psrad       mm3,        31                  ; all ones where negative
    psrad       mm4,        31

    packssdw    mm3,        mm4
    packsswb    mm3,        mm0                 ; byte mask: 0xff = filter

    movd        mm1,        DWORD PTR [rsi+rax*8]   ; current row pixels

    movq        mm2,        mm1                 ; keep the unfiltered bytes
    punpcklbw   mm1,        mm0

    paddw       mm1,        mm5                 ; pixel + sum
    mov         rcx,        rdx

    and         rcx,        127                 ; rounding table index
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    push        rax
    lea         rax,        [GLOBAL(sym(vp8_rv))]
    movq        mm4,        [rax + rcx*2]       ;vp8_rv[rcx*2]
    pop         rax
%elif ABI_IS_32BIT=0
    movq        mm4,        [r8 + rcx*2]        ;vp8_rv[rcx*2]
%else
    movq        mm4,        [sym(vp8_rv) + rcx*2]
%endif
    paddw       mm1,        mm4
    psraw       mm1,        4                   ; (pixel + sum + rv) >> 4

    packuswb    mm1,        mm0
    pand        mm1,        mm3                 ; filtered where mask set

    pandn       mm3,        mm2                 ; original elsewhere
    por         mm1,        mm3

    ; Park the result in the ring buffer; it is written back 8 rows
    ; later so that rows still inside the window keep their original
    ; values while they are being summed.
    and         rcx,        15
    movd        DWORD PTR [rsp+rcx*4], mm1      ;d[rcx*4]

    mov         rcx,        rdx
    sub         rcx,        8

    and         rcx,        15
    movd        mm1,        DWORD PTR [rsp+rcx*4]   ;d[rcx*4]

    movd        [rsi],      mm1                 ; store row (current - 8)
    lea         rsi,        [rsi+rax]

    lea         rdi,        [rdi+rax]
    add         rdx,        1

    cmp         edx,        dword arg(2)        ;rows
    jl          .loop_row


    ; s += 4.  s is a pointer, so update the whole argument slot through
    ; a register instead of adding to its low dword only.  rsi is free
    ; here: it is reloaded at .loop_col and restored in the epilog.
    mov         rsi,        arg(0)
    add         rsi,        4
    mov         arg(0),     rsi

    sub         dword arg(3), 4                 ; cols -= 4
    cmp         dword arg(3), 0
    jg          .loop_col

    add         rsp,        136
    pop         rsp                             ; counterpart of ALIGN_STACK

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit2


;-----------------------------------------------------------------------
;void vp8_plane_add_noise_mmx (unsigned char *Start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int Width, unsigned int Height, int Pitch)
;
; For every row: pick a start offset (rand() & 0xff) into the noise
; buffer, clamp each group of 8 source bytes with the three clamp
; vectors and add the noise bytes with wrap-around (paddb).
;
; Notes taken from the code below:
;   * rows are processed 8 bytes at a time until the byte offset reaches
;     Width, and the inner loop always runs at least once, so up to
;     7 bytes past Width may be touched.
;   * the clamp vectors are read at [blackclamp], [blackclamp+16] and
;     [blackclamp+32]; the whiteclamp/bothclamp arguments are not read.
;   * the shadowed copies of Start and Height are modified in place.
;   * mm1/mm2 are used and no emms is executed here.
;-----------------------------------------------------------------------
extern sym(rand)
global sym(vp8_plane_add_noise_mmx) PRIVATE
sym(vp8_plane_add_noise_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.addnoise_loop:
    call        sym(rand) WRT_PLT
    mov         rcx,        arg(1)              ;noise
    and         rax,        0xff                ; offset = rand() & 0xff
    add         rcx,        rax

    ; we rely on the fact that the clamping vectors are stored contiguously
    ; in black/white/both order. Note that we have to reload this here because
    ; rdx could be trashed by rand()
    mov         rdx,        arg(2)              ; blackclamp


    mov         rdi,        rcx                 ; rdi = noise + offset
    movsxd      rcx,        dword arg(5)        ;[Width]
    mov         rsi,        arg(0)              ;Pos
    xor         rax,        rax                 ; byte offset within the row

.addnoise_nextset:
    movq        mm1,        [rsi+rax]           ; get the source

    ; clamp both sides so we don't outrange adding noise
    psubusb     mm1,        [rdx]               ;blackclamp
    paddusb     mm1,        [rdx+32]            ;bothclamp
    psubusb     mm1,        [rdx+16]            ;whiteclamp

    movq        mm2,        [rdi+rax]           ; get the noise for this line
    paddb       mm1,        mm2                 ; add it in
    movq        [rsi+rax],  mm1                 ; store the result

    add         rax,        8                   ; move to the next 8 bytes

    cmp         rax,        rcx
    jl          .addnoise_nextset

    movsxd      rax,        dword arg(7)        ; Pitch
    add         arg(0),     rax                 ; Start += Pitch
    sub         dword arg(6), 1                 ; Height -= 1
    jg          .addnoise_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
; Neither table is referenced by the two routines above.
Blur:
    times 16 dw 16
    times  8 dw 64
    times 16 dw 16
    times  8 dw  0

rd:
    times 4 dw 0x40