;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; Macros used by the deblock function.
;
; All three macros reference `flimit`, which must be %define'd by the
; enclosing function to a 16-byte memory operand before they are invoked.
;-----------------------------------------------------------------------

; FIRST_2_ROWS
;   In:    xmm0 = 16 centre pixels
;          xmm1, xmm3 = 16 pixels each from two neighbours
;          flimit = 16 per-byte thresholds
;   Out:   xmm5 = pavgb(xmm1, xmm3)
;          xmm7 = byte mask, 0xFF where |xmm0-xmm1| >= flimit or
;                 |xmm0-xmm3| >= flimit (i.e. "do not filter this byte")
;   Clobb: xmm1 (left as zero), xmm2, xmm3, xmm4, xmm6
%macro FIRST_2_ROWS 0
    movdqa      xmm4, xmm0
    movdqa      xmm6, xmm0
    movdqa      xmm5, xmm1
    pavgb       xmm5, xmm3

    ;calculate absolute value
    ; |a-b| = sat(a-b) + sat(b-a) for unsigned bytes
    psubusb     xmm4, xmm1
    psubusb     xmm1, xmm0
    psubusb     xmm6, xmm3
    psubusb     xmm3, xmm0
    paddusb     xmm4, xmm1
    paddusb     xmm6, xmm3

    ;get threshold
    movdqa      xmm2, flimit
    pxor        xmm1, xmm1
    movdqa      xmm7, xmm2

    ;get mask
    ; sat(flimit - diff) == 0  <=>  diff >= flimit
    psubusb     xmm2, xmm4
    psubusb     xmm7, xmm6
    pcmpeqb     xmm2, xmm1
    pcmpeqb     xmm7, xmm1
    por         xmm7, xmm2
%endmacro

; SECOND_2_ROWS
;   In:    xmm0 = 16 centre pixels (unchanged since FIRST_2_ROWS)
;          xmm1, xmm3 = 16 pixels each from the other two neighbours
;          xmm5, xmm7 = average and mask produced by FIRST_2_ROWS
;   Out:   xmm0 = per byte: the original pixel where the accumulated mask
;                 is set, otherwise pavgb(pavgb(xmm5, pavgb(xmm1, xmm3)), xmm0)
;   Clobb: xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7
%macro SECOND_2_ROWS 0
    movdqa      xmm6, xmm0
    movdqa      xmm4, xmm0
    movdqa      xmm2, xmm1
    pavgb       xmm1, xmm3

    ;calculate absolute value
    psubusb     xmm6, xmm2
    psubusb     xmm2, xmm0
    psubusb     xmm4, xmm3
    psubusb     xmm3, xmm0
    paddusb     xmm6, xmm2
    paddusb     xmm4, xmm3

    pavgb       xmm5, xmm1

    ;get threshold
    movdqa      xmm2, flimit
    pxor        xmm1, xmm1
    movdqa      xmm3, xmm2

    ;get mask
    psubusb     xmm2, xmm6
    psubusb     xmm3, xmm4
    pcmpeqb     xmm2, xmm1
    pcmpeqb     xmm3, xmm1

    por         xmm7, xmm2
    por         xmm7, xmm3

    pavgb       xmm5, xmm0

    ;decide if or not to use filtered value
    ; the two operands of paddusb have disjoint non-zero bytes, so the add
    ; acts as a per-byte select
    pand        xmm0, xmm7
    pandn       xmm7, xmm5
    paddusb     xmm0, xmm7
%endmacro

; UPDATE_FLIMIT
;   Copies the 16 threshold bytes at [rbx] to [rsp] (the `flimit` slot) and
;   advances rbx by 16. Both accesses use movdqa, so [rbx] and [rsp] must be
;   16-byte aligned.
;   Clobb: xmm2
%macro UPDATE_FLIMIT 0
    movdqa      xmm2, XMMWORD PTR [rbx]
    movdqa      [rsp], xmm2
    add         rbx, 16
%endmacro

;-----------------------------------------------------------------------
;void vp8_post_proc_down_and_across_mb_row_sse2
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int cols,
;    int *flimits,
;    int size
;)
;
; For each of `size` rows:
;   1. "down" pass: filters 16 columns at a time using the rows at
;      -2, -1, +1 and +2 strides around src_ptr and writes to dst_ptr.
;   2. "across" pass: replicates the first/last dst byte into 8 bytes of
;      left/right border, then filters dst in place using the pixels at
;      -2, -1, +1 and +2 columns. Stores are delayed by 16 bytes (held in
;      mm0/mm1) so that unfiltered neighbours are still read.
;
; flimits is consumed 16 bytes per 16-column group through aligned loads.
;
; Register roles:
;   rsi = src, rdi = dst, rax = src stride, rcx = rows remaining,
;   rdx = column offset, rbx = flimits cursor, [rsp] = current flimit
;
; Uses MMX registers and does not execute emms.
; NOTE(review): presumably the caller clears MMX state afterwards - confirm.
;-----------------------------------------------------------------------
global sym(vp8_post_proc_down_and_across_mb_row_sse2) PRIVATE
sym(vp8_post_proc_down_and_across_mb_row_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    SAVE_XMM 7
    push        rbx
    push        rsi
    push        rdi
    ; end prolog
    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; put flimit on stack
    mov         rbx, arg(5)           ;flimits ptr
    UPDATE_FLIMIT

%define flimit [rsp]

    mov         rsi, arg(0)           ;src_ptr
    mov         rdi, arg(1)           ;dst_ptr

    movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line
    movsxd      rcx, DWORD PTR arg(6) ;rows in a macroblock
.nextrow:
    xor         rdx, rdx              ;col
.nextcol:
    ;load current and next 2 rows
    movdqu      xmm0, XMMWORD PTR [rsi]
    movdqu      xmm1, XMMWORD PTR [rsi + rax]
    movdqu      xmm3, XMMWORD PTR [rsi + 2*rax]

    FIRST_2_ROWS

    ;load above 2 rows
    neg         rax
    movdqu      xmm1, XMMWORD PTR [rsi + 2*rax]
    movdqu      xmm3, XMMWORD PTR [rsi + rax]

    SECOND_2_ROWS

    movdqu      XMMWORD PTR [rdi], xmm0

    neg         rax                   ; positive stride
    add         rsi, 16
    add         rdi, 16

    add         rdx, 16
    cmp         edx, dword arg(4)     ;cols
    jge         .downdone
    UPDATE_FLIMIT
    jmp         .nextcol

.downdone:
    ; done with the all cols, start the across filtering in place
    sub         rsi, rdx              ; rewind both pointers to the row start
    sub         rdi, rdx

    mov         rbx, arg(5)           ; flimits
    UPDATE_FLIMIT

    ; dup the first byte into the left border 8 times
    movq        mm1, [rdi]
    punpcklbw   mm1, mm1
    punpcklwd   mm1, mm1
    punpckldq   mm1, mm1
    mov         rdx, -8
    movq        [rdi+rdx], mm1

    ; dup the last byte into the right border
    movsxd      rdx, dword arg(4)
    movq        mm1, [rdi + rdx + -1]
    punpcklbw   mm1, mm1
    punpcklwd   mm1, mm1
    punpckldq   mm1, mm1
    movq        [rdi+rdx], mm1

    xor         rdx, rdx
    ; prime the delayed-store registers with the bytes already at dst[-16..-1]
    ; so the first pass through the loop rewrites them unchanged
    movq        mm0, QWORD PTR [rdi-16];
    movq        mm1, QWORD PTR [rdi-8];

.acrossnextcol:
    movdqu      xmm0, XMMWORD PTR [rdi + rdx]
    movdqu      xmm1, XMMWORD PTR [rdi + rdx -2]
    movdqu      xmm3, XMMWORD PTR [rdi + rdx -1]

    FIRST_2_ROWS

    movdqu      xmm1, XMMWORD PTR [rdi + rdx +1]
    movdqu      xmm3, XMMWORD PTR [rdi + rdx +2]

    SECOND_2_ROWS

    movq        QWORD PTR [rdi+rdx-16], mm0  ; store previous 8 bytes
    movq        QWORD PTR [rdi+rdx-8], mm1   ; store previous 8 bytes
    movdq2q     mm0, xmm0             ; keep this group's result for the next pass
    psrldq      xmm0, 8
    movdq2q     mm1, xmm0

    add         rdx, 16
    cmp         edx, dword arg(4)     ;cols
    jge         .acrossdone
    UPDATE_FLIMIT
    jmp         .acrossnextcol

.acrossdone:
    ; last 16 pixels
    movq        QWORD PTR [rdi+rdx-16], mm0

    ; the upper 8 bytes are only stored when rdx == cols exactly
    cmp         edx, dword arg(4)
    jne         .throw_last_8
    movq        QWORD PTR [rdi+rdx-8], mm1
.throw_last_8:
    ; done with this row
    add         rsi, rax              ;next src line
    ; strides are reloaded sign-extended, matching the initial load above, so
    ; that a negative stride is not turned into a large positive offset
    movsxd      rax, dword arg(3)     ;dst_pixels_per_line
    add         rdi, rax              ;next destination
    movsxd      rax, dword arg(2)     ;src_pixels_per_line

    mov         rbx, arg(5)           ;flimits
    UPDATE_FLIMIT

    dec         rcx                   ;decrement count
    jnz         .nextrow              ;next row

    add         rsp, 16
    pop         rsp
    ; begin epilog
    pop         rdi
    pop         rsi
    pop         rbx
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit

;-----------------------------------------------------------------------
;void vp8_mbpost_proc_down_xmm(unsigned char *dst,
;                              int pitch, int rows, int cols,int flimit)
;
; Vertical filter, processed in strips of 8 columns.
; For each strip:
;   - the last row is copied into the 8 rows below it and the first row
;     into the 8 rows above it (the buffer must have room for both);
;   - a running sum (xmm5, 8 words) and running sum of squares
;     (xmm6/xmm7, 8 dwords) are kept over a 15-row window;
;   - where 15*sumsq - sum*sum < flimit the pixel is replaced with
;     (pixel + sum + vp8_rv[...]) >> 4, otherwise it is left unchanged;
;   - results go through a 16-entry ring buffer d[16][8] on the stack and
;     are written back 8 rows late, so the window reads unfiltered rows.
;
; The shadowed copies of rows, cols and dst are modified in place.
; Uses mm0 and does not execute emms.
; NOTE(review): presumably the caller clears MMX state afterwards - confirm.
;-----------------------------------------------------------------------
extern sym(vp8_rv)
global sym(vp8_mbpost_proc_down_xmm) PRIVATE
sym(vp8_mbpost_proc_down_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 128+16

    ; unsigned char d[16][8] at [rsp]
    ; create flimit4 at [rsp+128]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
    mov         [rsp+128+8], eax
    mov         [rsp+128+12], eax
%define flimit4 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8, [GLOBAL(sym(vp8_rv))]
%endif

    ;rows +=8;
    add         dword arg(2), 8

    ;for(c=0; c<cols; c+=8)
.loop_col:
    mov         rsi, arg(0)           ; s
    pxor        xmm0, xmm0            ;

    movsxd      rax, dword ptr arg(1) ;pitch       ;

    ; this copies the last row down into the border 8 rows
    mov         rdi, rsi
    ; rows is an int: load only its 32 bits, sign-extended, rather than the
    ; whole argument slot
    movsxd      rdx, dword arg(2)
    sub         rdx, 9                ; (rows + 8) - 9 = index of the last row
    imul        rdx, rax
    lea         rdi, [rdi+rdx]
    movq        xmm1, QWORD ptr[rdi]  ; last row
    mov         rcx, 8
.init_borderd:                        ; initialize borders
    lea         rdi, [rdi + rax]
    movq        [rdi], xmm1

    dec         rcx
    jne         .init_borderd

    neg         rax                   ; rax = -pitch

    ; this copies the first row up into the border 8 rows
    mov         rdi, rsi
    movq        xmm1, QWORD ptr[rdi]  ; first row
    mov         rcx, 8
.init_border:                         ; initialize borders
    lea         rdi, [rdi + rax]
    movq        [rdi], xmm1

    dec         rcx
    jne         .init_border



    lea         rsi, [rsi + rax*8];   ; rsi = s[-pitch*8]
    neg         rax

    pxor        xmm5, xmm5
    pxor        xmm6, xmm6            ;

    pxor        xmm7, xmm7            ;
    mov         rdi, rsi

    mov         rcx, 15               ;

    ; accumulate sum and sum of squares over the first 15 rows of the window
.loop_initvar:
    movq        xmm1, QWORD PTR [rdi];
    punpcklbw   xmm1, xmm0            ;

    paddw       xmm5, xmm1            ;
    pmullw      xmm1, xmm1            ;

    movdqa      xmm2, xmm1            ;
    punpcklwd   xmm1, xmm0            ;

    punpckhwd   xmm2, xmm0            ;
    paddd       xmm6, xmm1            ;

    paddd       xmm7, xmm2            ;
    lea         rdi, [rdi+rax]        ;

    dec         rcx
    jne         .loop_initvar
    ;save the var and sum
    xor         rdx, rdx
.loop_row:
    movq        xmm1, QWORD PTR [rsi] ; [s-pitch*8]
    movq        xmm2, QWORD PTR [rdi] ; [s+pitch*7]

    punpcklbw   xmm1, xmm0
    punpcklbw   xmm2, xmm0

    ; slide the window: add the entering row, drop the leaving row
    paddw       xmm5, xmm2
    psubw       xmm5, xmm1

    pmullw      xmm2, xmm2
    movdqa      xmm4, xmm2

    punpcklwd   xmm2, xmm0
    punpckhwd   xmm4, xmm0

    paddd       xmm6, xmm2
    paddd       xmm7, xmm4

    pmullw      xmm1, xmm1
    movdqa      xmm2, xmm1

    punpcklwd   xmm1, xmm0
    psubd       xmm6, xmm1

    punpckhwd   xmm2, xmm0
    psubd       xmm7, xmm2


    ; xmm3/xmm4 = sumsq*16 - sumsq = sumsq*15
    movdqa      xmm3, xmm6
    pslld       xmm3, 4

    psubd       xmm3, xmm6
    movdqa      xmm1, xmm5

    ; 32-bit sum*sum assembled from the low (pmullw) and high (pmulhw) halves
    movdqa      xmm4, xmm5
    pmullw      xmm1, xmm1

    pmulhw      xmm4, xmm4
    movdqa      xmm2, xmm1

    punpcklwd   xmm1, xmm4
    punpckhwd   xmm2, xmm4

    movdqa      xmm4, xmm7
    pslld       xmm4, 4

    psubd       xmm4, xmm7

    psubd       xmm3, xmm1
    psubd       xmm4, xmm2

    psubd       xmm3, flimit4
    psubd       xmm4, flimit4

    ; the sign bit becomes the mask: all ones where sumsq*15 - sum*sum < flimit
    psrad       xmm3, 31
    psrad       xmm4, 31

    packssdw    xmm3, xmm4
    packsswb    xmm3, xmm0

    movq        xmm1, QWORD PTR [rsi+rax*8]

    movq        xmm2, xmm1
    punpcklbw   xmm1, xmm0

    paddw       xmm1, xmm5
    mov         rcx, rdx

    and         rcx, 127
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    push        rax
    lea         rax, [GLOBAL(sym(vp8_rv))]
    movdqu      xmm4, [rax + rcx*2]   ;vp8_rv[rcx*2]
    pop         rax
%elif ABI_IS_32BIT=0
    movdqu      xmm4, [r8 + rcx*2]    ;vp8_rv[rcx*2]
%else
    movdqu      xmm4, [sym(vp8_rv) + rcx*2]
%endif

    paddw       xmm1, xmm4
    ;paddw      xmm1, eight8s
    psraw       xmm1, 4

    packuswb    xmm1, xmm0
    pand        xmm1, xmm3

    pandn       xmm3, xmm2
    por         xmm1, xmm3

    and         rcx, 15
    movq        QWORD PTR [rsp + rcx*8], xmm1 ;d[rcx*8]

    ; write back the row produced 8 iterations ago
    mov         rcx, rdx
    sub         rcx, 8

    and         rcx, 15
    movq        mm0, [rsp + rcx*8]    ;d[rcx*8]

    movq        [rsi], mm0
    lea         rsi, [rsi+rax]

    lea         rdi, [rdi+rax]
    add         rdx, 1

    cmp         edx, dword arg(2)     ;rows
    jl          .loop_row

    ; s += 8, done at pointer width: a dword add would only update the low
    ; half of a 64-bit pointer. rax is reloaded at .loop_col.
    mov         rax, 8
    add         arg(0), rax           ; s += 8
    sub         dword arg(3), 8       ; cols -= 8
    cmp         dword arg(3), 0
    jg          .loop_col

    add         rsp, 128+16
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


;-----------------------------------------------------------------------
;void vp8_mbpost_proc_across_ip_xmm(unsigned char *src,
;                                   int pitch, int rows, int cols,int flimit)
;
; Horizontal, in-place counterpart of vp8_mbpost_proc_down_xmm.
; For each row:
;   - the first byte is replicated into s[-8..-1] and the last byte into
;     s[cols..cols+7] (the buffer must have room for both);
;   - sum and sumsq are initialised over s[-8..6];
;   - 4 pixels are processed per iteration up to cols+8; where
;     15*sumsq - sum*sum < flimit the pixel becomes (pixel + sum + 8) >> 4;
;   - stores are delayed by 8 bytes through mm0/mm1.
;
; The shadowed copies of src and rows are modified in place.
; Uses MMX registers and does not execute emms.
; NOTE(review): presumably the caller clears MMX state afterwards - confirm.
;-----------------------------------------------------------------------
global sym(vp8_mbpost_proc_across_ip_xmm) PRIVATE
sym(vp8_mbpost_proc_across_ip_xmm):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 16

    ; create flimit4 at [rsp]
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp], eax
    mov         [rsp+4], eax
    mov         [rsp+8], eax
    mov         [rsp+12], eax
%define flimit4 [rsp]


    ;for(r=0;r<rows;r++)
.ip_row_loop:

    xor         rdx,    rdx ;sumsq=0;
    xor         rcx,    rcx ;sum=0;
    mov         rsi,    arg(0); s


    ; dup the first byte into the left border 8 times
    movq        mm1,   [rsi]
    punpcklbw   mm1,   mm1
    punpcklwd   mm1,   mm1
    punpckldq   mm1,   mm1

    mov         rdi,    -8
    movq        [rsi+rdi], mm1

    ; dup the last byte into the right border
    ; cols goes in rax, which is dead here; rdx must stay 0 because it is the
    ; sumsq accumulator for .ip_var_loop
    movsxd      rax,    dword arg(3)
    movq        mm1,   [rsi + rax + -1]
    punpcklbw   mm1,   mm1
    punpcklwd   mm1,   mm1
    punpckldq   mm1,   mm1
    movq        [rsi+rax], mm1

.ip_var_loop:
    ;for(i=-8;i<=6;i++)
    ;{
    ;    sumsq += s[i]*s[i];
    ;    sum   += s[i];
    ;}
    movzx       eax, byte [rsi+rdi]
    add         ecx, eax
    mul         al                    ; ax = al*al; upper eax is still zero
    add         edx, eax
    add         rdi, 1
    cmp         rdi, 6
    jle         .ip_var_loop


    ;mov         rax,    sumsq
    ;movd        xmm7,   rax
    movd        xmm7,   edx

    ;mov         rax,    sum
    ;movd        xmm6,   rax
    movd        xmm6,   ecx

    mov         rsi,    arg(0) ;s
    xor         rcx,    rcx

    movsxd      rdx,    dword arg(3) ;cols
    add         rdx,    8
    pxor        mm0,    mm0
    pxor        mm1,    mm1

    pxor        xmm0,   xmm0
.nextcol4:

    movd        xmm1,   DWORD PTR [rsi+rcx-8]   ; -8 -7 -6 -5
    movd        xmm2,   DWORD PTR [rsi+rcx+7]   ; +7 +8 +9 +10

    punpcklbw   xmm1,   xmm0                    ; expanding
    punpcklbw   xmm2,   xmm0                    ; expanding

    punpcklwd   xmm1,   xmm0                    ; expanding to dwords
    punpcklwd   xmm2,   xmm0                    ; expanding to dwords

    psubd       xmm2,   xmm1                    ; 7--8   8--7   9--6 10--5
    paddd       xmm1,   xmm1                    ; -8*2   -7*2   -6*2 -5*2

    paddd       xmm1,   xmm2                    ; 7+-8   8+-7   9+-6 10+-5
    pmaddwd     xmm1,   xmm2                    ; squared of 7+-8   8+-7   9+-6 10+-5

    paddd       xmm6,   xmm2
    paddd       xmm7,   xmm1

    pshufd      xmm6,   xmm6,   0               ; duplicate the last ones
    pshufd      xmm7,   xmm7,   0               ; duplicate the last ones

    psrldq      xmm1,       4                   ; 8--7   9--6 10--5  0000
    psrldq      xmm2,       4                   ; 8--7   9--6 10--5  0000

    pshufd      xmm3,   xmm1,   3               ; 0000  8--7   8--7   8--7 squared
    pshufd      xmm4,   xmm2,   3               ; 0000  8--7   8--7   8--7 squared

    paddd       xmm6,   xmm4
    paddd       xmm7,   xmm3

    pshufd      xmm3,   xmm1,   01011111b       ; 0000  0000   9--6   9--6 squared
    pshufd      xmm4,   xmm2,   01011111b       ; 0000  0000   9--6   9--6 squared

    paddd       xmm7,   xmm3
    paddd       xmm6,   xmm4

    pshufd      xmm3,   xmm1,   10111111b       ; 0000  0000   8--7   8--7 squared
    pshufd      xmm4,   xmm2,   10111111b       ; 0000  0000   8--7   8--7 squared

    paddd       xmm7,   xmm3
    paddd       xmm6,   xmm4

    movdqa      xmm3,   xmm6
    pmaddwd     xmm3,   xmm3

    ; xmm5 = sumsq*16 - sumsq - sum*sum - flimit; its sign bit is the mask
    movdqa      xmm5,   xmm7
    pslld       xmm5,   4

    psubd       xmm5,   xmm7
    psubd       xmm5,   xmm3

    psubd       xmm5,   flimit4
    psrad       xmm5,   31

    packssdw    xmm5,   xmm0
    packsswb    xmm5,   xmm0

    movd        xmm1,   DWORD PTR [rsi+rcx]
    movq        xmm2,   xmm1

    punpcklbw   xmm1,   xmm0
    punpcklwd   xmm1,   xmm0

    paddd       xmm1,   xmm6
    paddd       xmm1,   [GLOBAL(four8s)]

    psrad       xmm1,   4
    packssdw    xmm1,   xmm0

    packuswb    xmm1,   xmm0
    pand        xmm1,   xmm5

    pandn       xmm5,   xmm2
    por         xmm5,   xmm1

    ; delayed store: mm0 holds the result computed two iterations ago
    movd        [rsi+rcx-8],  mm0
    movq        mm0,    mm1

    movdq2q     mm1,    xmm5
    psrldq      xmm7,   12                      ; keep only the last lane

    psrldq      xmm6,   12                      ; keep only the last lane
    add         rcx,    4

    cmp         rcx,    rdx
    jl          .nextcol4

    ;s+=pitch;
    movsxd      rax,    dword arg(1)
    add         arg(0), rax

    sub         dword arg(2), 1 ;rows-=1
    cmp         dword arg(2), 0
    jg          .ip_row_loop

    add         rsp, 16
    pop         rsp

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit4


;-----------------------------------------------------------------------
;void vp8_plane_add_noise_wmt (unsigned char *Start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int Width, unsigned int Height, int Pitch)
;
; For each of Height rows: calls rand(), takes the low 8 bits as an offset
; into `noise`, then for every 16-byte group of the row clamps the pixels
; (- blackclamp, + bothclamp, - whiteclamp, all saturating) and adds 16
; noise bytes with a wrapping byte add.
;
; Only the blackclamp pointer is read; whiteclamp and bothclamp are accessed
; at blackclamp+16 and blackclamp+32. The clamp vectors are used as SSE
; memory operands and therefore must be 16-byte aligned.
; Whole 16-byte groups are read and written, so up to 15 bytes past Width
; may be touched when Width is not a multiple of 16.
;
; NOTE(review): stack alignment at the call to rand() depends on what
; SHADOW_ARGS_TO_STACK / GET_GOT push - confirm against x86_abi_support.asm.
;-----------------------------------------------------------------------
extern sym(rand)
global sym(vp8_plane_add_noise_wmt) PRIVATE
sym(vp8_plane_add_noise_wmt):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.addnoise_loop:
    call sym(rand) WRT_PLT
    mov     rcx, arg(1) ;noise
    and     rax, 0xff
    add     rcx, rax

    ; we rely on the fact that the clamping vectors are stored contiguously
    ; in black/white/both order. Note that we have to reload this here because
    ; rdx could be trashed by rand()
    mov     rdx, arg(2) ; blackclamp


            mov     rdi, rcx
            movsxd  rcx, dword arg(5) ;[Width]
            mov     rsi, arg(0) ;Pos
            xor         rax,rax

.addnoise_nextset:
            movdqu      xmm1,[rsi+rax]         ; get the source

            psubusb     xmm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
            paddusb     xmm1, [rdx+32] ;bothclamp
            psubusb     xmm1, [rdx+16] ;whiteclamp

            movdqu      xmm2,[rdi+rax]         ; get the noise for this line
            paddb       xmm1,xmm2              ; add it in
            movdqu      [rsi+rax],xmm1         ; store the result

            add         rax,16                 ; move to the next 16 bytes

            cmp         rax, rcx
            jl          .addnoise_nextset

    movsxd  rax, dword arg(7) ; Pitch
    add     arg(0), rax ; Start += Pitch
    sub     dword arg(6), 1   ; Height -= 1
    jg      .addnoise_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
; rounding constant (+8 before >> 4) for vp8_mbpost_proc_across_ip_xmm
four8s:
    times 4 dd 8