;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; File:   media/libvpx/vp9/common/x86/vp9_postproc_sse2.asm
; Syntax: NASM/Yasm (Intel operand order).
; ABI:    abstracted by the macros from x86_abi_support.asm (arg(), sym(),
;         SHADOW_ARGS_TO_STACK, SAVE_XMM, GET_GOT, ALIGN_STACK, ...), so the
;         same source is assembled for 32-bit and 64-bit targets.

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
;void vp9_post_proc_down_and_across_xmm
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int rows,
;    int cols,
;    int flimit
;)
;
; For every row: first a vertical 5-tap blur (src -> dst), then a horizontal
; 5-tap blur in place on dst.  For each pixel the blurred value is
;     (4*p0 + p-2 + p-1 + p+1 + p+2 + 4) >> 3
; and it is used only when |p0 - neighbour| <= flimit for all four neighbours;
; otherwise the unfiltered pixel is kept.
;
; Pixels are handled 8 at a time while the column counter is < cols, so the
; work is effectively rounded up to a multiple of 8 columns.  The horizontal
; pass reads and rewrites the 8 bytes at dst[-8..-1] of every row and reads up
; to dst[col+9].
;
; Register roles:
;   rsi = source cursor          rdi = destination cursor
;   rax = source pitch (negated temporarily inside the vertical pass)
;   rcx = rows remaining         rdx = column offset inside the row
;   xmm0 = zero                  xmm2 = low word of flimit in all 8 lanes
;   mm0  = previous 8 output pixels of the horizontal pass (delayed store)
;-----------------------------------------------------------------------------
global sym(vp9_post_proc_down_and_across_xmm) PRIVATE
sym(vp9_post_proc_down_and_across_xmm):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ALIGN_STACK 16, rax
    ; move the global rd onto the stack, since we don't have enough registers
    ; to do PIC addressing
    movdqa      xmm0,       [GLOBAL(rd42)]
    sub         rsp,        16
    movdqa      [rsp],      xmm0
%define RD42 [rsp]
%else
%define RD42 [GLOBAL(rd42)]
%endif

    ; broadcast the low 16 bits of flimit to all eight word lanes of xmm2
    movd        xmm2,       dword ptr arg(6)        ; flimit
    punpcklwd   xmm2,       xmm2
    punpckldq   xmm2,       xmm2
    punpcklqdq  xmm2,       xmm2

    mov         rsi,        arg(0)                  ; src_ptr
    mov         rdi,        arg(1)                  ; dst_ptr

    movsxd      rcx,        DWORD PTR arg(4)        ; rows
    movsxd      rax,        DWORD PTR arg(2)        ; src_pixels_per_line
    pxor        xmm0,       xmm0                    ; xmm0 = 0 (unpack helper)

.nextrow:

    xor         rdx,        rdx                     ; column offset = 0
.nextcol:
    ; ---- vertical pass: 8 pixels of the current row ----
    movq        xmm3,       QWORD PTR [rsi]         ; r0 p0..p7 (bytes)
    punpcklbw   xmm3,       xmm0                    ; xmm3 = r0 as words
    movdqa      xmm1,       xmm3                    ; xmm1 = r0 (kept as centre)
    psllw       xmm3,       2                       ; xmm3 = 4 * r0

    movq        xmm5,       QWORD PTR [rsi + rax]   ; r+1 p0..p7
    punpcklbw   xmm5,       xmm0                    ; xmm5 = r+1 as words
    paddusw     xmm3,       xmm5                    ; sum += r+1

    ; thresholding: |a-b| = sat(a-b) + sat(b-a)
    movdqa      xmm7,       xmm1                    ; xmm7 = r0
    psubusw     xmm7,       xmm5                    ; xmm7 = sat(r0 - r+1)
    psubusw     xmm5,       xmm1                    ; xmm5 = sat(r+1 - r0)
    paddusw     xmm7,       xmm5                    ; xmm7 = abs(r0 - r+1)
    pcmpgtw     xmm7,       xmm2                    ; mask: abs diff > flimit

    movq        xmm5,       QWORD PTR [rsi + 2*rax] ; r+2 p0..p7
    punpcklbw   xmm5,       xmm0                    ; xmm5 = r+2 as words
    paddusw     xmm3,       xmm5                    ; sum += r+2

    ; thresholding
    movdqa      xmm6,       xmm1                    ; xmm6 = r0
    psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r+2)
    psubusw     xmm5,       xmm1                    ; xmm5 = sat(r+2 - r0)
    paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r+2)
    pcmpgtw     xmm6,       xmm2
    por         xmm7,       xmm6                    ; accumulate thresholds


    neg         rax                                 ; rax = -pitch for rows above
    movq        xmm5,       QWORD PTR [rsi+2*rax]   ; r-2 p0..p7
    punpcklbw   xmm5,       xmm0                    ; xmm5 = r-2 as words
    paddusw     xmm3,       xmm5                    ; sum += r-2

    ; thresholding
    movdqa      xmm6,       xmm1                    ; xmm6 = r0
    psubusw     xmm6,       xmm5                    ; xmm6 = sat(r0 - r-2)
    psubusw     xmm5,       xmm1                    ; xmm5 = sat(r-2 - r0)
    paddusw     xmm6,       xmm5                    ; xmm6 = abs(r0 - r-2)
    pcmpgtw     xmm6,       xmm2
    por         xmm7,       xmm6                    ; accumulate thresholds

    movq        xmm4,       QWORD PTR [rsi+rax]     ; r-1 p0..p7
    punpcklbw   xmm4,       xmm0                    ; xmm4 = r-1 as words
    paddusw     xmm3,       xmm4                    ; sum += r-1

    ; thresholding
    movdqa      xmm6,       xmm1                    ; xmm6 = r0
    psubusw     xmm6,       xmm4                    ; xmm6 = sat(r0 - r-1)
    psubusw     xmm4,       xmm1                    ; xmm4 = sat(r-1 - r0)
    paddusw     xmm6,       xmm4                    ; xmm6 = abs(r0 - r-1)
    pcmpgtw     xmm6,       xmm2
    por         xmm7,       xmm6                    ; accumulate thresholds


    paddusw     xmm3,       RD42                    ; sum += 4 (rounding)
    psraw       xmm3,       3                       ; sum /= 8

    pand        xmm1,       xmm7                    ; source where over threshold
    pandn       xmm7,       xmm3                    ; blurred where within threshold
    paddusw     xmm1,       xmm7                    ; combination

    packuswb    xmm1,       xmm0                    ; pack to bytes
    movq        QWORD PTR [rdi], xmm1               ; store 8 pixels to dst

    neg         rax                                 ; pitch is positive again
    add         rsi,        8
    add         rdi,        8

    add         rdx,        8
    cmp         edx,        dword arg(5)            ; cols

    jl          .nextcol

    ; done with all the cols, start the across filtering in place
    sub         rsi,        rdx                     ; back to start of src row
    sub         rdi,        rdx                     ; back to start of dst row

    xor         rdx,        rdx
    ; prime the delayed store with the 8 bytes left of the row so the first
    ; store in the loop below writes them back unchanged
    movq        mm0,        QWORD PTR [rdi-8]

.acrossnextcol:
    ; build xmm4 = 12 bytes p-2..p9 around the current 8 pixels
    movq        xmm7,       QWORD PTR [rdi +rdx -2]
    movd        xmm4,       DWORD PTR [rdi +rdx +6]

    pslldq      xmm4,       8
    por         xmm4,       xmm7

    movdqa      xmm3,       xmm4
    psrldq      xmm3,       2
    punpcklbw   xmm3,       xmm0                    ; xmm3 = p0..p7 as words
    movdqa      xmm1,       xmm3                    ; xmm1 = p0..p7 (centre)
    psllw       xmm3,       2                       ; xmm3 = 4 * centre


    movdqa      xmm5,       xmm4
    psrldq      xmm5,       3
    punpcklbw   xmm5,       xmm0                    ; xmm5 = p1..p8
    paddusw     xmm3,       xmm5                    ; sum += right1

    ; thresholding
    movdqa      xmm7,       xmm1                    ; xmm7 = centre
    psubusw     xmm7,       xmm5                    ; xmm7 = sat(centre - right1)
    psubusw     xmm5,       xmm1                    ; xmm5 = sat(right1 - centre)
    paddusw     xmm7,       xmm5                    ; xmm7 = abs(centre - right1)
    pcmpgtw     xmm7,       xmm2

    movdqa      xmm5,       xmm4
    psrldq      xmm5,       4
    punpcklbw   xmm5,       xmm0                    ; xmm5 = p2..p9
    paddusw     xmm3,       xmm5                    ; sum += right2

    ; thresholding
    movdqa      xmm6,       xmm1                    ; xmm6 = centre
    psubusw     xmm6,       xmm5                    ; xmm6 = sat(centre - right2)
    psubusw     xmm5,       xmm1                    ; xmm5 = sat(right2 - centre)
    paddusw     xmm6,       xmm5                    ; xmm6 = abs(centre - right2)
    pcmpgtw     xmm6,       xmm2
    por         xmm7,       xmm6                    ; accumulate thresholds


    movdqa      xmm5,       xmm4                    ; xmm5 = p-2..p9 (bytes)
    punpcklbw   xmm5,       xmm0                    ; xmm5 = p-2..p5
    paddusw     xmm3,       xmm5                    ; sum += left2

    ; thresholding
    movdqa      xmm6,       xmm1                    ; xmm6 = centre
    psubusw     xmm6,       xmm5                    ; xmm6 = sat(centre - left2)
    psubusw     xmm5,       xmm1                    ; xmm5 = sat(left2 - centre)
    paddusw     xmm6,       xmm5                    ; xmm6 = abs(centre - left2)
    pcmpgtw     xmm6,       xmm2
    por         xmm7,       xmm6                    ; accumulate thresholds

    psrldq      xmm4,       1                       ; xmm4 = p-1..p9 (bytes)
    punpcklbw   xmm4,       xmm0                    ; xmm4 = p-1..p6
    paddusw     xmm3,       xmm4                    ; sum += left1

    ; thresholding
    movdqa      xmm6,       xmm1                    ; xmm6 = centre
    psubusw     xmm6,       xmm4                    ; xmm6 = sat(centre - left1)
    psubusw     xmm4,       xmm1                    ; xmm4 = sat(left1 - centre)
    paddusw     xmm6,       xmm4                    ; xmm6 = abs(centre - left1)
    pcmpgtw     xmm6,       xmm2
    por         xmm7,       xmm6                    ; accumulate thresholds

    paddusw     xmm3,       RD42                    ; sum += 4 (rounding)
    psraw       xmm3,       3                       ; sum /= 8

    pand        xmm1,       xmm7                    ; source where over threshold
    pandn       xmm7,       xmm3                    ; blurred where within threshold
    paddusw     xmm1,       xmm7                    ; combination

    packuswb    xmm1,       xmm0                    ; pack to bytes
    ; The store is delayed by one iteration so that the next iteration still
    ; sees unfiltered left neighbours.
    movq        QWORD PTR [rdi+rdx-8],  mm0         ; store previous eight bytes
    movdq2q     mm0,        xmm1

    add         rdx,        8
    cmp         edx,        dword arg(5)            ; cols
    jl          .acrossnextcol

    ; last 8 pixels
    movq        QWORD PTR [rdi+rdx-8],  mm0

    ; done with this row
    add         rsi,        rax                     ; next source line
    ; Sign-extend both pitches (as the initial load above does) so negative
    ; strides keep working on 64-bit targets after the first row.
    movsxd      rax,        dword ptr arg(3)        ; dst_pixels_per_line
    add         rdi,        rax                     ; next destination line
    movsxd      rax,        dword ptr arg(2)        ; src_pixels_per_line

    dec         rcx                                 ; decrement count
    jnz         .nextrow                            ; next row

    emms                                            ; mm0 was used: leave MMX state

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    add         rsp,        16
    pop         rsp
%endif
    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef RD42


;-----------------------------------------------------------------------------
;void vp9_mbpost_proc_down_xmm(unsigned char *dst,
;                            int pitch, int rows, int cols,int flimit)
;
; Vertical pass, 8 columns at a time.  For each column a running sum and sum
; of squares over a 15-row window (rows r-8 .. r+6 before the update, then
; r-7 .. r+7) is maintained.  Where 15*sumsq - sum*sum < flimit the pixel is
; replaced by (pixel + sum + vp9_rv[...]) >> 4, otherwise it is left alone.
;
; Results are parked in a 16-entry ring buffer d[16][8] on the stack and
; written back 8 rows late, so the window always reads unfiltered pixels.
;
; NOTE(review): the write-back is unconditional, so during the first 8
; iterations of each column pass whatever is in d[8..15] (not yet written in
; this pass) is stored to the 8 rows above dst.  Presumably that is border
; memory; confirm against the caller.
;
; Register roles inside the loops:
;   rsi = row leaving the window (s - 8*pitch), also the write-back row
;   rdi = row entering the window (s + 7*pitch)
;   rax = pitch          rdx = row counter          rcx = scratch index
;   xmm0 = zero          xmm5 = sum (8 words)
;   xmm6 / xmm7 = sum of squares for columns 0-3 / 4-7 (dwords)
;   r8 (64-bit only) = address of vp9_rv
;-----------------------------------------------------------------------------
extern sym(vp9_rv)
global sym(vp9_mbpost_proc_down_xmm) PRIVATE
sym(vp9_mbpost_proc_down_xmm):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp,        128+16

    ; unsigned char d[16][8] at [rsp]
    ; create flimit4 (flimit in four dwords) at [rsp+128]
    mov         eax,        dword ptr arg(4)        ; flimit
    mov         [rsp+128],      eax
    mov         [rsp+128+4],    eax
    mov         [rsp+128+8],    eax
    mov         [rsp+128+12],   eax
%define flimit4 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,         [GLOBAL(sym(vp9_rv))]
%endif

    ;rows +=8;  (extra iterations flush the 8-row delayed write-back)
    add         dword arg(2), 8

    ;for(c=0; c<cols; c+=8)
.loop_col:
    mov         rsi,        arg(0)                  ; s
    pxor        xmm0,       xmm0

    movsxd      rax,        dword ptr arg(1)        ; pitch
    neg         rax                                 ; rax = -pitch

    lea         rsi,        [rsi + rax*8]           ; rsi = s - 8*pitch
    neg         rax                                 ; rax = pitch


    pxor        xmm5,       xmm5                    ; sum = 0
    pxor        xmm6,       xmm6                    ; sumsq (cols 0-3) = 0

    pxor        xmm7,       xmm7                    ; sumsq (cols 4-7) = 0
    mov         rdi,        rsi

    mov         rcx,        15                      ; window height

    ; accumulate the 15 rows s-8*pitch .. s+6*pitch
.loop_initvar:
    movq        xmm1,       QWORD PTR [rdi]
    punpcklbw   xmm1,       xmm0                    ; 8 pixels as words

    paddw       xmm5,       xmm1                    ; sum += p
    pmullw      xmm1,       xmm1                    ; p*p (fits in 16 bits)

    movdqa      xmm2,       xmm1
    punpcklwd   xmm1,       xmm0                    ; squares 0-3 as dwords

    punpckhwd   xmm2,       xmm0                    ; squares 4-7 as dwords
    paddd       xmm6,       xmm1

    paddd       xmm7,       xmm2
    lea         rdi,        [rdi+rax]               ; next row

    dec         rcx
    jne         .loop_initvar
    ; here rdi = s + 7*pitch; xmm5/xmm6/xmm7 hold the initial sum and sumsq
    xor         rdx,        rdx                     ; r = 0
.loop_row:
    movq        xmm1,       QWORD PTR [rsi]         ; [s-pitch*8]  leaves window
    movq        xmm2,       QWORD PTR [rdi]         ; [s+pitch*7]  enters window

    punpcklbw   xmm1,       xmm0
    punpcklbw   xmm2,       xmm0

    paddw       xmm5,       xmm2                    ; sum += new
    psubw       xmm5,       xmm1                    ; sum -= old

    pmullw      xmm2,       xmm2                    ; new^2
    movdqa      xmm4,       xmm2

    punpcklwd   xmm2,       xmm0
    punpckhwd   xmm4,       xmm0

    paddd       xmm6,       xmm2                    ; sumsq += new^2
    paddd       xmm7,       xmm4

    pmullw      xmm1,       xmm1                    ; old^2
    movdqa      xmm2,       xmm1

    punpcklwd   xmm1,       xmm0
    psubd       xmm6,       xmm1                    ; sumsq -= old^2

    punpckhwd   xmm2,       xmm0
    psubd       xmm7,       xmm2


    movdqa      xmm3,       xmm6
    pslld       xmm3,       4

    psubd       xmm3,       xmm6                    ; xmm3 = 15 * sumsq (cols 0-3)
    movdqa      xmm1,       xmm5

    movdqa      xmm4,       xmm5
    pmullw      xmm1,       xmm1                    ; low 16 bits of sum^2

    pmulhw      xmm4,       xmm4                    ; high 16 bits of sum^2
    movdqa      xmm2,       xmm1

    punpcklwd   xmm1,       xmm4                    ; sum^2 cols 0-3 (dwords)
    punpckhwd   xmm2,       xmm4                    ; sum^2 cols 4-7 (dwords)

    movdqa      xmm4,       xmm7
    pslld       xmm4,       4

    psubd       xmm4,       xmm7                    ; xmm4 = 15 * sumsq (cols 4-7)

    psubd       xmm3,       xmm1                    ; 15*sumsq - sum^2
    psubd       xmm4,       xmm2

    psubd       xmm3,       flimit4                 ; ... - flimit
    psubd       xmm4,       flimit4

    psrad       xmm3,       31                      ; all-ones where result < 0
    psrad       xmm4,       31

    packssdw    xmm3,       xmm4
    packsswb    xmm3,       xmm0                    ; xmm3 = per-byte filter mask

    movq        xmm1,       QWORD PTR [rsi+rax*8]   ; centre row s

    movq        xmm2,       xmm1                    ; keep original bytes
    punpcklbw   xmm1,       xmm0

    paddw       xmm1,       xmm5                    ; p + sum
    mov         rcx,        rdx

    and         rcx,        127                     ; rcx = r & 127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    push        rax
    lea         rax,        [GLOBAL(sym(vp9_rv))]
    movdqu      xmm4,       [rax + rcx*2]           ; vp9_rv[rcx*2]
    pop         rax
%elif ABI_IS_32BIT=0
    movdqu      xmm4,       [r8 + rcx*2]            ; vp9_rv[rcx*2]
%else
    movdqu      xmm4,       [sym(vp9_rv) + rcx*2]
%endif

    paddw       xmm1,       xmm4                    ; + rounding/dither words
    ;paddw      xmm1,       eight8s
    psraw       xmm1,       4

    packuswb    xmm1,       xmm0
    pand        xmm1,       xmm3                    ; filtered where mask set

    pandn       xmm3,       xmm2                    ; original elsewhere
    por         xmm1,       xmm3

    and         rcx,        15                      ; rcx = r & 15
    movq        QWORD PTR [rsp + rcx*8], xmm1       ; d[r & 15] = result

    mov         rcx,        rdx
    sub         rcx,        8

    and         rcx,        15                      ; rcx = (r - 8) & 15
    movq        mm0,        [rsp + rcx*8]           ; result from 8 rows ago

    movq        [rsi],      mm0                     ; write back at s - 8*pitch
    lea         rsi,        [rsi+rax]

    lea         rdi,        [rdi+rax]
    add         rdx,        1

    cmp         edx,        dword arg(2)            ; rows (already += 8)
    jl          .loop_row

    ; s += 8.  Done through a register so the full pointer width is updated
    ; (a dword add would drop the carry into the upper half on 64-bit).
    ; rax is reloaded from arg(1) at the top of .loop_col.
    mov         rax,        arg(0)
    add         rax,        8
    mov         arg(0),     rax
    sub         dword arg(3), 8                     ; cols -= 8
    cmp         dword arg(3), 0
    jg          .loop_col

    emms                                            ; mm0 was used: leave MMX state

    add         rsp,        128+16
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


;-----------------------------------------------------------------------------
;void vp9_mbpost_proc_across_ip_xmm(unsigned char *src,
;                                int pitch, int rows, int cols,int flimit)
;
; Horizontal in-place counterpart of the function above.  Per row, a running
; sum / sum of squares over a 15-pixel window is kept; 4 pixels are processed
; per iteration.  Where 15*sumsq - sum*sum < flimit the pixel becomes
; (pixel + sum + 8) >> 4, otherwise it is unchanged.
;
; Output is delayed by two iterations (8 pixels) through mm0/mm1 so the
; window keeps reading unfiltered pixels; the loop therefore runs to cols+8.
; mm0/mm1 start at zero, so the first two iterations store zeros to
; s[-8..-1] of every row.
;
; Register roles inside .nextcol4:
;   rsi = s (row start)      rcx = column offset      rdx = cols + 8
;   xmm0 = zero
;   xmm6 = running sum, xmm7 = running sum of squares (one per dword lane)
;-----------------------------------------------------------------------------
global sym(vp9_mbpost_proc_across_ip_xmm) PRIVATE
sym(vp9_mbpost_proc_across_ip_xmm):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp,        16

    ; create flimit4 at [rsp]
    mov         eax,        dword ptr arg(4)        ; flimit
    mov         [rsp],      eax
    mov         [rsp+4],    eax
    mov         [rsp+8],    eax
    mov         [rsp+12],   eax
%define flimit4 [rsp]


    ;for(r=0;r<rows;r++)
.ip_row_loop:

    xor         rdx,        rdx                     ; sumsq=0;
    xor         rcx,        rcx                     ; sum=0;
    mov         rsi,        arg(0)                  ; s
    mov         rdi,        -8
.ip_var_loop:
    ;for(i=-8;i<=6;i++)
    ;{
    ;    sumsq += s[i]*s[i];
    ;    sum   += s[i];
    ;}
    movzx       eax,        byte [rsi+rdi]
    add         ecx,        eax
    mul         al                                  ; ax = al*al (edx untouched)
    add         edx,        eax
    add         rdi,        1
    cmp         rdi,        6
    jle         .ip_var_loop


    movd        xmm7,       edx                     ; xmm7 lane 0 = sumsq

    movd        xmm6,       ecx                     ; xmm6 lane 0 = sum

    mov         rsi,        arg(0)                  ; s
    xor         rcx,        rcx                     ; c = 0

    movsxd      rdx,        dword arg(3)            ; cols
    add         rdx,        8                       ; + 8 to flush delayed stores
    pxor        mm0,        mm0
    pxor        mm1,        mm1

    pxor        xmm0,       xmm0
.nextcol4:

    movd        xmm1,       DWORD PTR [rsi+rcx-8]   ; old: s[c-8 .. c-5]
    movd        xmm2,       DWORD PTR [rsi+rcx+7]   ; new: s[c+7 .. c+10]

    punpcklbw   xmm1,       xmm0                    ; expanding
    punpcklbw   xmm2,       xmm0                    ; expanding

    punpcklwd   xmm1,       xmm0                    ; expanding to dwords
    punpcklwd   xmm2,       xmm0                    ; expanding to dwords

    psubd       xmm2,       xmm1                    ; d  = new - old
    paddd       xmm1,       xmm1                    ; 2*old

    paddd       xmm1,       xmm2                    ; new + old
    pmaddwd     xmm1,       xmm2                    ; dq = new^2 - old^2

    ; turn the four per-pixel deltas into prefix sums on top of the running
    ; totals: lane k = total + d0 + ... + dk
    paddd       xmm6,       xmm2                    ; lane 0: sum   + d0
    paddd       xmm7,       xmm1                    ; lane 0: sumsq + dq0

    pshufd      xmm6,       xmm6,   0               ; broadcast lane 0
    pshufd      xmm7,       xmm7,   0               ; broadcast lane 0

    psrldq      xmm1,       4                       ; dq1  dq2  dq3  0
    psrldq      xmm2,       4                       ; d1   d2   d3   0

    pshufd      xmm3,       xmm1,   3               ; 0    dq1  dq1  dq1
    pshufd      xmm4,       xmm2,   3               ; 0    d1   d1   d1

    paddd       xmm6,       xmm4
    paddd       xmm7,       xmm3

    pshufd      xmm3,       xmm1,   01011111b       ; 0    0    dq2  dq2
    pshufd      xmm4,       xmm2,   01011111b       ; 0    0    d2   d2

    paddd       xmm7,       xmm3
    paddd       xmm6,       xmm4

    pshufd      xmm3,       xmm1,   10111111b       ; 0    0    0    dq3
    pshufd      xmm4,       xmm2,   10111111b       ; 0    0    0    d3

    paddd       xmm7,       xmm3
    paddd       xmm6,       xmm4

    movdqa      xmm3,       xmm6
    pmaddwd     xmm3,       xmm3                    ; sum^2

    movdqa      xmm5,       xmm7
    pslld       xmm5,       4

    psubd       xmm5,       xmm7                    ; 15 * sumsq
    psubd       xmm5,       xmm3                    ; - sum^2

    psubd       xmm5,       flimit4                 ; - flimit
    psrad       xmm5,       31                      ; all-ones where result < 0

    packssdw    xmm5,       xmm0
    packsswb    xmm5,       xmm0                    ; per-byte filter mask

    movd        xmm1,       DWORD PTR [rsi+rcx]     ; s[c .. c+3]
    movq        xmm2,       xmm1                    ; keep original bytes

    punpcklbw   xmm1,       xmm0
    punpcklwd   xmm1,       xmm0

    paddd       xmm1,       xmm6                    ; p + sum
    paddd       xmm1,       [GLOBAL(four8s)]        ; + 8 (rounding)

    psrad       xmm1,       4
    packssdw    xmm1,       xmm0

    packuswb    xmm1,       xmm0
    pand        xmm1,       xmm5                    ; filtered where mask set

    pandn       xmm5,       xmm2                    ; original elsewhere
    por         xmm5,       xmm1

    movd        [rsi+rcx-8],    mm0                 ; store result from 2 iterations ago
    movq        mm0,        mm1

    movdq2q     mm1,        xmm5
    psrldq      xmm7,       12                      ; keep lane 3 as new running sumsq

    psrldq      xmm6,       12                      ; keep lane 3 as new running sum
    add         rcx,        4

    cmp         rcx,        rdx
    jl          .nextcol4

    ;s+=pitch;
    movsxd      rax,        dword arg(1)
    add         arg(0),     rax

    sub         dword arg(2), 1                     ; rows-=1
    cmp         dword arg(2), 0
    jg          .ip_row_loop

    emms                                            ; mm0/mm1 were used: leave MMX state

    add         rsp,        16
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


;-----------------------------------------------------------------------------
;void vp9_plane_add_noise_wmt (unsigned char *start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int width, unsigned int height, int pitch)
;
; For each row: pick a random offset (rand() & 0xff) into the noise buffer,
; clamp the pixels with the black/both/white vectors, then add the noise
; bytes (wrapping add), 16 pixels at a time while the offset is < width.
;
; Only blackclamp (arg 2) is loaded; whiteclamp and bothclamp are addressed
; as [blackclamp+16] and [blackclamp+32].  The three vectors are used as
; memory operands of non-VEX SSE instructions, so they must be 16-byte
; aligned.
;
; NOTE(review): no stack space is reserved before the call to rand(); confirm
; that this is acceptable for every ABI this file is assembled for (Win64
; callers normally provide 32 bytes of shadow space).
;-----------------------------------------------------------------------------
extern sym(rand)
global sym(vp9_plane_add_noise_wmt) PRIVATE
sym(vp9_plane_add_noise_wmt):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.addnoise_loop:
    call        sym(rand) WRT_PLT
    mov         rcx,        arg(1)                  ; noise
    and         rax,        0xff                    ; random offset 0..255
    add         rcx,        rax

    ; we rely on the fact that the clamping vectors are stored contiguously
    ; in black/white/both order. Note that we have to reload this here because
    ; rdx could be trashed by rand()
    mov         rdx,        arg(2)                  ; blackclamp


    mov         rdi,        rcx                     ; rdi = noise + offset
    movsxd      rcx,        dword arg(5)            ; [Width]
    mov         rsi,        arg(0)                  ; Pos
    xor         rax,        rax

.addnoise_nextset:
    movdqu      xmm1,       [rsi+rax]               ; get the source

    ; clamp both sides so we don't outrange adding noise
    psubusb     xmm1,       [rdx]                   ; blackclamp
    paddusb     xmm1,       [rdx+32]                ; bothclamp
    psubusb     xmm1,       [rdx+16]                ; whiteclamp

    movdqu      xmm2,       [rdi+rax]               ; get the noise for this line
    paddb       xmm1,       xmm2                    ; add it in
    movdqu      [rsi+rax],  xmm1                    ; store the result

    add         rax,        16                      ; move to the next 16 pixels

    cmp         rax,        rcx
    jl          .addnoise_nextset

    movsxd      rax,        dword arg(7)            ; Pitch
    add         arg(0),     rax                     ; Start += Pitch
    sub         dword arg(6), 1                     ; Height -= 1
    jg          .addnoise_loop

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
; rounding constant (4 in each of 8 words) for the ">> 3" blur
rd42:
    times 8 dw 0x04
; rounding constant (8 in each of 4 dwords) for the ">> 4" filter
four8s:
    times 4 dd 8