;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; MMX post-processing helpers (media/libvpx/vp9/common/x86/vp9_postproc_mmx.asm):
;   vp9_post_proc_down_and_across_mmx - thresholded 5-tap blur, vertical then
;                                       horizontal
;   vp9_mbpost_proc_down_mmx          - variance-gated vertical smoothing
;   vp9_plane_add_noise_mmx           - clamp a plane and add a noise pattern
;
; The ABI helper macros used below (sym, arg, GLOBAL, GET_GOT, RESTORE_GOT,
; SHADOW_ARGS_TO_STACK, UNSHADOW_ARGS, ALIGN_STACK, movsxd on 32-bit, ...)
; are provided by the include; their exact expansion is not visible here.

%include "vpx_ports/x86_abi_support.asm"

; Blur taps sum to VP9_FILTER_WEIGHT, so the weighted sum is scaled back
; with a right shift by VP9_FILTER_SHIFT.
%define VP9_FILTER_WEIGHT 128
%define VP9_FILTER_SHIFT  7

;void vp9_post_proc_down_and_across_mmx
;(
;    unsigned char *src_ptr,
;    unsigned char *dst_ptr,
;    int src_pixels_per_line,
;    int dst_pixels_per_line,
;    int rows,
;    int cols,
;    int flimit
;)
;
; For every row: a vertical ("down") pass filters src into dst four pixels
; at a time, then a horizontal ("across") pass filters dst in place.
; A pixel keeps its source value when the absolute difference to any of its
; four neighbours exceeds flimit; otherwise it takes the blurred value.
;
; The down pass reads rows -2..+2 around the current source row and the
; across pass reads a few bytes before the start of the destination row
; ([rdi-4], [rdi+rdx-2]) - presumably the caller guarantees a border;
; confirm against the callers.
;
; Register use:
;   rsi = src row, rdi = dst row, rax = src pitch, rbx = &Blur,
;   rcx = rows remaining, rdx = column offset,
;   mm0 = zero, mm2 = flimit broadcast to four words.
global sym(vp9_post_proc_down_and_across_mmx) PRIVATE
sym(vp9_post_proc_down_and_across_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 7
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ; Save rbx before anything is placed at [rsp]: RD below is addressed
    ; relative to rsp, so no push may stay live between its definition and
    ; its last use.
    push        rbx

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    ; move the global rd onto the stack, since we don't have enough registers
    ; to do PIC addressing (rbx is about to be reused for the Blur table)
    movq        mm0, [GLOBAL(rd)]
    sub         rsp, 8
    movq        [rsp], mm0
%define RD [rsp]
%else
%define RD [GLOBAL(rd)]
%endif

        lea         rbx, [GLOBAL(Blur)]
        movd        mm2, dword ptr arg(6)     ;flimit
        punpcklwd   mm2, mm2
        punpckldq   mm2, mm2                  ; mm2 = low word of flimit x4

        mov         rsi,        arg(0) ;src_ptr
        mov         rdi,        arg(1) ;dst_ptr

        movsxd      rcx, DWORD PTR arg(4) ;rows
        movsxd      rax, DWORD PTR arg(2) ;src_pixels_per_line
        pxor        mm0, mm0              ; mm0 = 00000000

.nextrow:

        xor         rdx,        rdx       ; clear out rdx for use as loop counter
.nextcol:

        ; ---- down pass: rows r-2, r-1, r0, r+1, r+2 of four columns ----
        pxor        mm7,        mm7       ; mm7 = 00000000
        movq        mm6,        [rbx + 32 ]    ; mm6 = kernel 2 taps (centre)
        movq        mm3,        [rsi]          ; mm3 = r0 p0..p7
        punpcklbw   mm3,        mm0            ; mm3 = r0 p0..p3 as words
        movq        mm1,        mm3            ; mm1 = r0 p0..p3 (kept as source)
        pmullw      mm3,        mm6            ; mm3 = r0 * kernel 2 taps

        movq        mm6,        [rbx + 48]     ; mm6 = kernel 3 taps
        movq        mm5,        [rsi + rax]    ; mm5 = r1 p0..p7
        punpcklbw   mm5,        mm0            ; mm5 = r1 p0..p3
        pmullw      mm6,        mm5            ; mm6 = r1 * kernel 3 taps
        paddusw     mm3,        mm6            ; mm3 += mm6

        ; thresholding
        movq        mm7,        mm1            ; mm7 = r0 p0..p3
        psubusw     mm7,        mm5            ; mm7 = sat(r0 - r1)
        psubusw     mm5,        mm1            ; mm5 = sat(r1 - r0)
        paddusw     mm7,        mm5            ; mm7 = abs(r0 - r1)
        pcmpgtw     mm7,        mm2            ; mm7 = mask of words > flimit

        movq        mm6,        [rbx + 64 ]    ; mm6 = kernel 4 taps
        movq        mm5,        [rsi + 2*rax]  ; mm5 = r2 p0..p7
        punpcklbw   mm5,        mm0            ; mm5 = r2 p0..p3
        pmullw      mm6,        mm5            ; mm6 = r2 * kernel 4 taps
        paddusw     mm3,        mm6            ; mm3 += mm6

        ; thresholding
        movq        mm6,        mm1            ; mm6 = r0 p0..p3
        psubusw     mm6,        mm5            ; mm6 = sat(r0 - r2)
        psubusw     mm5,        mm1            ; mm5 = sat(r2 - r0)
        paddusw     mm6,        mm5            ; mm6 = abs(r0 - r2)
        pcmpgtw     mm6,        mm2
        por         mm7,        mm6            ; accumulate thresholds


        neg         rax                        ; rax = -pitch for the rows above
        movq        mm6,        [rbx ]         ; mm6 = kernel 0 taps
        movq        mm5,        [rsi+2*rax]    ; mm5 = r-2 p0..p7
        punpcklbw   mm5,        mm0            ; mm5 = r-2 p0..p3
        pmullw      mm6,        mm5            ; mm6 = r-2 * kernel 0 taps
        paddusw     mm3,        mm6            ; mm3 += mm6

        ; thresholding
        movq        mm6,        mm1            ; mm6 = r0 p0..p3
        psubusw     mm6,        mm5            ; mm6 = sat(r0 - r-2)
        psubusw     mm5,        mm1            ; mm5 = sat(r-2 - r0)
        paddusw     mm6,        mm5            ; mm6 = abs(r0 - r-2)
        pcmpgtw     mm6,        mm2
        por         mm7,        mm6            ; accumulate thresholds

        movq        mm6,        [rbx + 16]     ; mm6 = kernel 1 taps
        movq        mm4,        [rsi+rax]      ; mm4 = r-1 p0..p7
        punpcklbw   mm4,        mm0            ; mm4 = r-1 p0..p3
        pmullw      mm6,        mm4            ; mm6 = r-1 * kernel 1 taps
        paddusw     mm3,        mm6            ; mm3 += mm6

        ; thresholding
        movq        mm6,        mm1            ; mm6 = r0 p0..p3
        psubusw     mm6,        mm4            ; mm6 = sat(r0 - r-1)
        psubusw     mm4,        mm1            ; mm4 = sat(r-1 - r0)
        paddusw     mm6,        mm4            ; mm6 = abs(r0 - r-1)
        pcmpgtw     mm6,        mm2
        por         mm7,        mm6            ; accumulate thresholds


        paddusw     mm3,        RD             ; mm3 += round value
        psraw       mm3,        VP9_FILTER_SHIFT     ; mm3 /= 128

        pand        mm1,        mm7            ; source where some diff > flimit
        pandn       mm7,        mm3            ; blurred result everywhere else
        paddusw     mm1,        mm7            ; combination

        packuswb    mm1,        mm0            ; pack to bytes

        movd        [rdi],      mm1            ; store four filtered pixels
        neg         rax                        ; pitch is positive


        add         rsi,        4
        add         rdi,        4
        add         rdx,        4

        cmp         edx,        dword ptr arg(5) ;cols
        jl          .nextcol
        ; done with the all cols, start the across filtering in place
        sub         rsi,        rdx
        sub         rdi,        rdx

        ; ---- across pass: filter the destination row in place ----
        ; eax carries the four output bytes of the previous group so they
        ; are written one iteration late; the current group therefore still
        ; sees unfiltered left neighbours.  The first delayed store rewrites
        ; [rdi-4] with the value loaded here.  Only the low 32 bits (eax) of
        ; this load are used.  rax is reloaded from arg(2) after the loop, so
        ; nothing is pushed here and RD stays valid at [rsp].
        xor         rdx,    rdx
        mov         rax,    [rdi-4];

.acrossnextcol:
        pxor        mm7,        mm7            ; mm7 = 00000000
        movq        mm6,        [rbx + 32 ]    ; mm6 = kernel 2 taps (centre)
        movq        mm4,        [rdi+rdx]      ; mm4 = p0..p7
        movq        mm3,        mm4            ; mm3 = p0..p7
        punpcklbw   mm3,        mm0            ; mm3 = p0..p3
        movq        mm1,        mm3            ; mm1 = p0..p3 (kept as source)
        pmullw      mm3,        mm6            ; mm3 = p0..p3 * kernel 2 taps

        movq        mm6,        [rbx + 48]     ; mm6 = kernel 3 taps
        psrlq       mm4,        8              ; mm4 = p1..p7
        movq        mm5,        mm4            ; mm5 = p1..p7
        punpcklbw   mm5,        mm0            ; mm5 = p1..p4
        pmullw      mm6,        mm5            ; mm6 = p1..p4 * kernel 3 taps
        paddusw     mm3,        mm6            ; mm3 += mm6

        ; thresholding
        movq        mm7,        mm1            ; mm7 = p0..p3
        psubusw     mm7,        mm5            ; mm7 = sat(p0..p3 - p1..p4)
        psubusw     mm5,        mm1            ; mm5 = sat(p1..p4 - p0..p3)
        paddusw     mm7,        mm5            ; mm7 = abs(p0..p3 - p1..p4)
        pcmpgtw     mm7,        mm2            ; mm7 = mask of words > flimit

        movq        mm6,        [rbx + 64 ]    ; mm6 = kernel 4 taps
        psrlq       mm4,        8              ; mm4 = p2..p7
        movq        mm5,        mm4            ; mm5 = p2..p7
        punpcklbw   mm5,        mm0            ; mm5 = p2..p5
        pmullw      mm6,        mm5            ; mm6 = p2..p5 * kernel 4 taps
        paddusw     mm3,        mm6            ; mm3 += mm6

        ; thresholding
        movq        mm6,        mm1            ; mm6 = p0..p3
        psubusw     mm6,        mm5            ; mm6 = sat(p0..p3 - p2..p5)
        psubusw     mm5,        mm1            ; mm5 = sat(p2..p5 - p0..p3)
        paddusw     mm6,        mm5            ; mm6 = abs(p0..p3 - p2..p5)
        pcmpgtw     mm6,        mm2
        por         mm7,        mm6            ; accumulate thresholds


        movq        mm6,        [rbx ]         ; mm6 = kernel 0 taps
        movq        mm4,        [rdi+rdx-2]    ; mm4 = p-2..p5
        movq        mm5,        mm4            ; mm5 = p-2..p5
        punpcklbw   mm5,        mm0            ; mm5 = p-2..p1
        pmullw      mm6,        mm5            ; mm6 = p-2..p1 * kernel 0 taps
        paddusw     mm3,        mm6            ; mm3 += mm6

        ; thresholding
        movq        mm6,        mm1            ; mm6 = p0..p3
        psubusw     mm6,        mm5            ; mm6 = sat(p0..p3 - p-2..p1)
        psubusw     mm5,        mm1            ; mm5 = sat(p-2..p1 - p0..p3)
        paddusw     mm6,        mm5            ; mm6 = abs(p0..p3 - p-2..p1)
        pcmpgtw     mm6,        mm2
        por         mm7,        mm6            ; accumulate thresholds

        movq        mm6,        [rbx + 16]     ; mm6 = kernel 1 taps
        psrlq       mm4,        8              ; mm4 = p-1..p5
        punpcklbw   mm4,        mm0            ; mm4 = p-1..p2
        pmullw      mm6,        mm4            ; mm6 = p-1..p2 * kernel 1 taps
        paddusw     mm3,        mm6            ; mm3 += mm6

        ; thresholding
        movq        mm6,        mm1            ; mm6 = p0..p3
        psubusw     mm6,        mm4            ; mm6 = sat(p0..p3 - p-1..p2)
        psubusw     mm4,        mm1            ; mm4 = sat(p-1..p2 - p0..p3)
        paddusw     mm6,        mm4            ; mm6 = abs(p0..p3 - p-1..p2)
        pcmpgtw     mm6,        mm2
        por         mm7,        mm6            ; accumulate thresholds

        paddusw     mm3,        RD             ; mm3 += round value
        psraw       mm3,        VP9_FILTER_SHIFT     ; mm3 /= 128

        pand        mm1,        mm7            ; source where some diff > flimit
        pandn       mm7,        mm3            ; blurred result everywhere else
        paddusw     mm1,        mm7            ; combination

        packuswb    mm1,        mm0            ; pack to bytes
        mov         DWORD PTR [rdi+rdx-4],  eax   ; store previous four bytes
        movd        eax,        mm1               ; hold this group for next pass

        add         rdx,        4
        cmp         edx,        dword ptr arg(5) ;cols
        jl          .acrossnextcol;

        mov         DWORD PTR [rdi+rdx-4],  eax   ; flush the last group

        ; done with this row
        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line
        add         rsi,rax               ; next source line
        movsxd      rax,        dword ptr arg(3) ;dst_pixels_per_line
        add         rdi,rax               ; next destination line
        movsxd      rax,        dword ptr arg(2) ;src_pixels_per_line (loop invariant)

        dec         rcx                   ; decrement count
        jnz         .nextrow              ; next row

%if ABI_IS_32BIT=1 && CONFIG_PIC=1
    add         rsp, 8                    ; release the stack copy of rd
%endif
    pop         rbx

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef RD


;void vp9_mbpost_proc_down_mmx(unsigned char *dst,
;                             int pitch, int rows, int cols,int flimit)
;
; Vertical smoothing, four columns per outer iteration.  For each column a
; running sum (mm5, words) and running sum of squares (mm6/mm7, dwords) are
; kept over a 15-row window (rows -7..+7 around the current row).  A pixel
; is replaced by (sum + pixel + vp9_rv[..]) >> 4 when
; 15*sumsq - sum*sum - flimit is negative; otherwise it is left unchanged.
; Results go through a 16-entry ring buffer on the stack and are written
; back eight rows late, so the window keeps reading unfiltered pixels.
;
; The routine reads rows -8..-1 above dst and writes back to rows -8.. as
; part of the delayed store - presumably the caller provides that border;
; confirm against the callers.  arg(0), arg(2) and arg(3) are modified in
; their stack slots.
extern sym(vp9_rv)
global sym(vp9_mbpost_proc_down_mmx) PRIVATE
sym(vp9_mbpost_proc_down_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    ALIGN_STACK 16, rax
    sub         rsp, 136

    ; unsigned char d[16][8] at [rsp] (accessed below with a 4-byte stride)
    ; create flimit2 at [rsp+128]: flimit duplicated into two dwords
    mov         eax, dword ptr arg(4) ;flimit
    mov         [rsp+128], eax
    mov         [rsp+128+4], eax
%define flimit2 [rsp+128]

%if ABI_IS_32BIT=0
    lea         r8,       [GLOBAL(sym(vp9_rv))]
%endif

    ;rows +=8; the row loop runs 8 extra iterations to flush the ring buffer
    add         dword ptr arg(2), 8

    ;for(c=0; c < cols; c+=4)
.loop_col:
            mov         rsi,        arg(0)  ;s
            pxor        mm0,        mm0     ; mm0 = zero for unpacking

            movsxd      rax,        dword ptr arg(1) ;pitch
            neg         rax                                     ; rax = -pitch

            lea         rsi,        [rsi + rax*8]               ; rsi = s - pitch*8
            neg         rax                                     ; rax = pitch


            pxor        mm5,        mm5     ; running sum (4 words)
            pxor        mm6,        mm6     ; running sum of squares, cols 0-1

            pxor        mm7,        mm7     ; running sum of squares, cols 2-3
            mov         rdi,        rsi

            mov         rcx,        15      ; prime the window with 15 rows

.loop_initvar:
            movd        mm1,        DWORD PTR [rdi]
            punpcklbw   mm1,        mm0     ; four pixels as words

            paddw       mm5,        mm1     ; sum += x
            pmullw      mm1,        mm1     ; x*x

            movq        mm2,        mm1
            punpcklwd   mm1,        mm0     ; x*x for cols 0-1 as dwords

            punpckhwd   mm2,        mm0     ; x*x for cols 2-3 as dwords
            paddd       mm6,        mm1

            paddd       mm7,        mm2
            lea         rdi,        [rdi+rax]

            dec         rcx
            jne         .loop_initvar
            ; rdi now points 15 rows below the starting rsi (s + pitch*7)
            xor         rdx,        rdx     ; rdx = row counter
.loop_row:
            movd        mm1,        DWORD PTR [rsi]     ; [s-pitch*8], leaves the window
            movd        mm2,        DWORD PTR [rdi]     ; [s+pitch*7], enters the window

            punpcklbw   mm1,        mm0
            punpcklbw   mm2,        mm0

            paddw       mm5,        mm2     ; sum += entering row
            psubw       mm5,        mm1     ; sum -= leaving row

            pmullw      mm2,        mm2
            movq        mm4,        mm2

            punpcklwd   mm2,        mm0
            punpckhwd   mm4,        mm0

            paddd       mm6,        mm2     ; sumsq += entering^2
            paddd       mm7,        mm4

            pmullw      mm1,        mm1
            movq        mm2,        mm1

            punpcklwd   mm1,        mm0
            psubd       mm6,        mm1     ; sumsq -= leaving^2

            punpckhwd   mm2,        mm0
            psubd       mm7,        mm2


            movq        mm3,        mm6
            pslld       mm3,        4

            psubd       mm3,        mm6     ; mm3 = 15 * sumsq (cols 0-1)
            movq        mm1,        mm5

            movq        mm4,        mm5
            pmullw      mm1,        mm1     ; low  16 bits of sum*sum

            pmulhw      mm4,        mm4     ; high 16 bits of sum*sum
            movq        mm2,        mm1

            punpcklwd   mm1,        mm4     ; sum*sum as dwords, cols 0-1
            punpckhwd   mm2,        mm4     ; sum*sum as dwords, cols 2-3

            movq        mm4,        mm7
            pslld       mm4,        4

            psubd       mm4,        mm7     ; mm4 = 15 * sumsq (cols 2-3)

            psubd       mm3,        mm1
            psubd       mm4,        mm2

            psubd       mm3,        flimit2
            psubd       mm4,        flimit2

            psrad       mm3,        31      ; all ones where the result is negative
            psrad       mm4,        31

            packssdw    mm3,        mm4
            packsswb    mm3,        mm0     ; mm3 = per-pixel byte mask

            movd        mm1,        DWORD PTR [rsi+rax*8]   ; current row pixels

            movq        mm2,        mm1     ; mm2 = unfiltered bytes
            punpcklbw   mm1,        mm0

            paddw       mm1,        mm5     ; pixel + window sum
            mov         rcx,        rdx

            and         rcx,        127     ; index into vp9_rv wraps at 128
%if ABI_IS_32BIT=1 && CONFIG_PIC=1
            push        rax
            lea         rax,        [GLOBAL(sym(vp9_rv))]
            movq        mm4,        [rax + rcx*2] ;vp9_rv[rcx*2]
            pop         rax
%elif ABI_IS_32BIT=0
            movq        mm4,        [r8 + rcx*2] ;vp9_rv[rcx*2]
%else
            movq        mm4,        [sym(vp9_rv) + rcx*2]
%endif
            paddw       mm1,        mm4     ; add the per-row vp9_rv words
            psraw       mm1,        4       ; /16

            packuswb    mm1,        mm0
            pand        mm1,        mm3     ; filtered value where mask is set

            pandn       mm3,        mm2     ; original value elsewhere
            por         mm1,        mm3

            and         rcx,        15
            movd        DWORD PTR   [rsp+rcx*4], mm1 ; d[row & 15] = result

            mov         rcx,        rdx
            sub         rcx,        8

            and         rcx,        15
            movd        mm1,        DWORD PTR [rsp+rcx*4] ; d[(row - 8) & 15]

            movd        [rsi],      mm1     ; write back eight rows behind
            lea         rsi,        [rsi+rax]

            lea         rdi,        [rdi+rax]
            add         rdx,        1

            cmp         edx,        dword arg(2) ;rows
            jl          .loop_row


            ; s += 4, done at pointer width so a carry out of the low
            ; 32 bits is not lost on 64-bit builds.  rsi is reloaded from
            ; arg(0) at .loop_col, so it is free to use here.
            mov         rsi,        arg(0)
            add         rsi,        4
            mov         arg(0),     rsi
            sub         dword arg(3), 4 ; cols -= 4
            cmp         dword arg(3), 0
            jg          .loop_col

    add         rsp, 136
    pop         rsp                 ; pairs with ALIGN_STACK above

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret
%undef flimit2


;void vp9_plane_add_noise_mmx (unsigned char *start, unsigned char *noise,
;                            unsigned char blackclamp[16],
;                            unsigned char whiteclamp[16],
;                            unsigned char bothclamp[16],
;                            unsigned int width, unsigned int height, int pitch)
;
; For every row: pick a starting offset (rand() & 0xff) into the noise
; buffer, clamp each group of eight source bytes with the three clamp
; vectors, add the noise bytes with wraparound (paddb) and store the result
; in place.  Columns advance eight bytes at a time while the offset is
; below width, so a width that is not a multiple of eight is rounded up.
; whiteclamp and bothclamp are not read through their own arguments; they
; are addressed as blackclamp+16 and blackclamp+32.
extern sym(rand)
global sym(vp9_plane_add_noise_mmx) PRIVATE
sym(vp9_plane_add_noise_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 8
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

.addnoise_loop:
    call sym(rand) WRT_PLT
    mov     rcx, arg(1) ;noise
    and     rax, 0xff               ; noise offset for this row: 0..255
    add     rcx, rax

    ; we rely on the fact that the clamping vectors are stored contiguously
    ; in black/white/both order. Note that we have to reload this here because
    ; rdx could be trashed by rand()
    mov     rdx, arg(2) ; blackclamp


            mov     rdi, rcx                ; rdi = noise + offset
            movsxd  rcx, dword arg(5) ;[Width]
            mov     rsi, arg(0) ;Pos
            xor         rax,rax             ; rax = column offset

.addnoise_nextset:
            movq        mm1,[rsi+rax]         ; get the source

            psubusb     mm1, [rdx]    ;blackclamp        ; clamp both sides so we don't outrange adding noise
            paddusb     mm1, [rdx+32] ;bothclamp
            psubusb     mm1, [rdx+16] ;whiteclamp

            movq        mm2,[rdi+rax]         ; get the noise for these 8 bytes
            paddb       mm1,mm2               ; add it in
            movq        [rsi+rax],mm1         ; store the result

            add         rax,8                 ; move to the next 8 bytes

            cmp         rax, rcx
            jl          .addnoise_nextset

    movsxd  rax, dword arg(7) ; Pitch
    add     arg(0), rax ; Start += Pitch
    sub     dword arg(6), 1   ; Height -= 1
    jg      .addnoise_loop

    ; begin epilog
    pop rdi
    pop rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
align 16
; Blur taps, one 16-byte row of words per tap; the MMX code above loads the
; first four words of the rows at offsets 0, 16, 32, 48 and 64:
; 16, 16, 64, 16, 16 (sum = VP9_FILTER_WEIGHT).
Blur:
    times 16 dw 16
    times  8 dw 64
    times 16 dw 16
    times  8 dw  0

; Rounding term added before the shift by VP9_FILTER_SHIFT (half of 128).
rd:
    times 4 dw 0x40