;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; Syntax: NASM/yasm (Intel operand order).
; arg(n), SHADOW_ARGS_TO_STACK/UNSHADOW_ARGS, SAVE_XMM/RESTORE_XMM,
; GET_GOT/RESTORE_GOT, GLOBAL() and sym() are provided by the include below;
; all argument access in this file goes through arg(n).

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
;void vp8_idct_dequant_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
; )
;
; DC-only reconstruction of two 4x4 blocks that sit side by side in dst.
;   - Reads the word at qcoeff[0] and at byte offset 32 (qcoeff[16]) and
;     multiplies both by dequant[0].
;   - Writes zero over the 4 bytes at qcoeff+0 and at qcoeff+32.
;   - Adds ((dc + 4) >> 3) to 4 rows of 8 pixels: pixels 0-3 of each row
;     get the first value, pixels 4-7 the second; results saturate to 0..255.
;
; Regs:  rax = qcoeff, then dst; rdx = dequant, then dst_stride;
;        rcx = 3 * dst_stride
;        xmm4 = per-pixel DC words, xmm5 = zero, xmm0-xmm3 = rows 0-3
; Only xmm0-xmm5 are touched, so no SAVE_XMM is used here.
;-----------------------------------------------------------------------
global sym(vp8_idct_dequant_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    ; end prolog

    mov         rdx, arg(1)                 ; dequant
    mov         rax, arg(0)                 ; qcoeff

    movd        xmm4, [rax]                 ; word 0 = first block's coeff 0
    movd        xmm5, [rdx]                 ; word 0 = dequant[0]

    pinsrw      xmm4, [rax+32], 4           ; word 4 = coeff 0 at byte offset 32
    pinsrw      xmm5, [rdx], 4              ; word 4 = dequant[0] again

    pmullw      xmm4, xmm5                  ; dequantized DCs in words 0 and 4

    ; Zero out xmm5, for use unpacking
    pxor        xmm5, xmm5

    ; clear coeffs (each movd stores 4 zero bytes)
    movd        [rax], xmm5
    movd        [rax+32], xmm5

    mov         rax, arg(2)                 ; dst
    movsxd      rdx, dword ptr arg(3)       ; dst_stride

    ; broadcast word 0 over words 0-3 and word 4 over words 4-7
    pshuflw     xmm4, xmm4, 00000000b
    pshufhw     xmm4, xmm4, 00000000b

    lea         rcx, [rdx + rdx*2]          ; dst_stride * 3
    paddw       xmm4, [GLOBAL(fours)]       ; + 4 so the shift below rounds

    psraw       xmm4, 3

    ; Load up predict blocks (8 pixels per row)
    movq        xmm0, [rax]
    movq        xmm1, [rax+rdx]
    movq        xmm2, [rax+2*rdx]
    movq        xmm3, [rax+rcx]

    ; Predict buffer needs to be expanded from bytes to words
    punpcklbw   xmm0, xmm5
    punpcklbw   xmm1, xmm5
    punpcklbw   xmm2, xmm5
    punpcklbw   xmm3, xmm5

    ; Add to predict buffer
    paddw       xmm0, xmm4
    paddw       xmm1, xmm4
    paddw       xmm2, xmm4
    paddw       xmm3, xmm4

    ; pack up before storing (unsigned saturation to bytes)
    packuswb    xmm0, xmm5
    packuswb    xmm1, xmm5
    packuswb    xmm2, xmm5
    packuswb    xmm3, xmm5

    ; store blocks back out
    movq        [rax], xmm0
    movq        [rax + rdx], xmm1

    lea         rax, [rax + 2*rdx]

    movq        [rax], xmm2
    movq        [rax + rdx], xmm3

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_idct_dequant_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
; )
;
; Dequantize, inverse-transform and add to dst for two 4x4 blocks at once.
;   - Reads 64 bytes at qcoeff (16 words at +0 and 16 words at +32),
;     multiplies each group of 16 by the 16 words at dequant, then writes
;     zero over all 64 bytes of qcoeff.
;   - Runs two butterfly passes with a transpose after each, adds 4 and
;     shifts right by 3 in the second pass.
;   - Adds the result to 4 rows of 8 pixels at dst, saturating to 0..255.
;
; Alignment: qcoeff and dequant are accessed with movdqa / SSE memory
;            operands, so both must be 16-byte aligned.
; Regs:  rax = qcoeff, rdx = dequant then dst_stride, rdi = dst,
;        rcx = 3 * dst_stride; xmm0-xmm7 all used (hence SAVE_XMM 7).
;-----------------------------------------------------------------------
global sym(vp8_idct_dequant_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 4
    SAVE_XMM 7
    GET_GOT     rbx
    push        rdi
    ; end prolog

    mov         rax, arg(0)                 ; qcoeff
    mov         rdx, arg(1)                 ; dequant
    mov         rdi, arg(2)                 ; dst

    ; Zero out xmm7, used here to clear the coefficient buffer
    pxor        xmm7, xmm7

    ; note the swapped use of xmm1 and xmm2 (xmm2 <- +16, xmm1 <- +32),
    ; necessary for the shuffles below to spit out sensible data
    movdqa      xmm0, [rax]
    movdqa      xmm2, [rax+16]
    movdqa      xmm1, [rax+32]
    movdqa      xmm3, [rax+48]

    ; Clear out coeffs
    movdqa      [rax], xmm7
    movdqa      [rax+16], xmm7
    movdqa      [rax+32], xmm7
    movdqa      [rax+48], xmm7

    ; dequantize qcoeff buffer (same dequant table for both blocks)
    pmullw      xmm0, [rdx]
    pmullw      xmm2, [rdx+16]
    pmullw      xmm1, [rdx]
    pmullw      xmm3, [rdx+16]
    movsxd      rdx, dword ptr arg(3)       ; dst_stride

    ; repack so block 0 row x and block 1 row x are together
    movdqa      xmm4, xmm0
    punpckldq   xmm0, xmm1
    punpckhdq   xmm4, xmm1

    pshufd      xmm0, xmm0, 11011000b
    pshufd      xmm1, xmm4, 11011000b

    movdqa      xmm4, xmm2
    punpckldq   xmm2, xmm3
    punpckhdq   xmm4, xmm3

    pshufd      xmm2, xmm2, 11011000b
    pshufd      xmm3, xmm4, 11011000b

    ; first pass
    ; x_s1sqr2 (0x8A8C) is negative as a signed word, so pmulhw followed by
    ; paddw x yields x * 0x8A8C / 65536; x_c1sqr2less1 followed by paddw x
    ; yields x * (1 + 0x4E7B / 65536).
    psubw       xmm0, xmm2                  ; b1 = 0-2
    paddw       xmm2, xmm2                  ;

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0                  ; a1 = 0+2

    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    lea         rcx, [rdx + rdx*2]          ; dst_stride * 3
    paddw       xmm5, xmm1                  ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3                  ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5                  ; xmm7 = ip3*cos - ip1*sin

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1                  ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4                  ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3, xmm5                  ; d1
    movdqa      xmm6, xmm2                  ; a1

    movdqa      xmm4, xmm0                  ; b1
    paddw       xmm2, xmm3                  ; a1 + d1   -> 000..003
    paddw       xmm4, xmm7                  ; b1 + xmm7 -> 008..011
    psubw       xmm0, xmm7                  ; b1 - xmm7 -> 004..007
    psubw       xmm6, xmm3                  ; a1 - d1   -> 012..015

    ; transpose for the second pass
    movdqa      xmm7, xmm2                  ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0                  ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0                  ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4                  ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6                  ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6                  ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2                  ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4                  ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4                  ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7                  ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5                  ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5                  ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2                  ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7                  ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7                  ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1                  ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6                  ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6                  ; 115 111 015 011 107 103 007 003

    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; second pass (same butterfly, plus rounding: +4 then >> 3)
    psubw       xmm0, xmm2                  ; b1 = 0-2
    paddw       xmm2, xmm2

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0                  ; a1 = 0+2

    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1                  ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3                  ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5                  ; xmm7 = ip3*cos - ip1*sin

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1                  ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4                  ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3, xmm5                  ; d1
    paddw       xmm0, [GLOBAL(fours)]       ; b1 + 4

    paddw       xmm2, [GLOBAL(fours)]       ; a1 + 4
    movdqa      xmm6, xmm2                  ; a1

    movdqa      xmm4, xmm0                  ; b1
    paddw       xmm2, xmm3                  ; a1 + d1   -> 000..003
    paddw       xmm4, xmm7                  ; b1 + xmm7 -> 008..011
    psubw       xmm0, xmm7                  ; b1 - xmm7 -> 004..007
    psubw       xmm6, xmm3                  ; a1 - d1   -> 012..015

    psraw       xmm2, 3
    psraw       xmm0, 3
    psraw       xmm4, 3
    psraw       xmm6, 3

    ; transpose to save
    movdqa      xmm7, xmm2                  ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0                  ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0                  ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4                  ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6                  ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6                  ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2                  ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4                  ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4                  ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7                  ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5                  ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5                  ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2                  ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7                  ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7                  ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1                  ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6                  ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6                  ; 115 111 015 011 107 103 007 003

    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; xmm7 becomes the zero register for unpack/pack from here on
    pxor        xmm7, xmm7

    ; Load up predict blocks
    movq        xmm4, [rdi]
    movq        xmm5, [rdi+rdx]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    movq        xmm4, [rdi+2*rdx]
    movq        xmm5, [rdi+rcx]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; pack up before storing (unsigned saturation to bytes)
    packuswb    xmm0, xmm7
    packuswb    xmm1, xmm7
    packuswb    xmm2, xmm7
    packuswb    xmm3, xmm7

    ; store blocks back out
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1
    movq        [rdi + rdx*2], xmm2
    movq        [rdi + rcx], xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_idct_dequant_dc_0_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
;
; DC-only reconstruction of two side-by-side 4x4 blocks where the two DC
; values are taken from dc[0] and dc[1] instead of from qcoeff.
;   - qcoeff and dequant are not read or written by this routine.
;   - Adds ((dc[i] + 4) >> 3) to 4 rows of 8 pixels: pixels 0-3 use dc[0],
;     pixels 4-7 use dc[1]; results saturate to 0..255.
;
; Regs:  rdi = dst, rdx = dc then dst_stride, rcx = 3 * dst_stride
;        xmm4 = per-pixel DC words, xmm5 = zero, xmm0-xmm3 = rows 0-3
;-----------------------------------------------------------------------
global sym(vp8_idct_dequant_dc_0_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_0_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rdi
    ; end prolog

    ; special case when 2 blocks have 0 or 1 coeffs
    ; dc is supplied separately, so no need to load qcoeff
    mov         rdi, arg(2)                 ; dst
    mov         rdx, arg(4)                 ; dc

    ; Zero out xmm5, for use unpacking
    pxor        xmm5, xmm5

    ; load up 2 dc words here == 2*16 = doubleword
    movd        xmm4, [rdx]

    movsxd      rdx, dword ptr arg(3)       ; dst_stride
    lea         rcx, [rdx + rdx*2]          ; dst_stride * 3

    ; Load up predict blocks
    movq        xmm0, [rdi]
    movq        xmm1, [rdi+rdx*1]
    movq        xmm2, [rdi+rdx*2]
    movq        xmm3, [rdi+rcx]

    ; Duplicate and expand dc across: words 0-3 = dc[0], words 4-7 = dc[1]
    punpcklwd   xmm4, xmm4
    punpckldq   xmm4, xmm4

    ; Rounding (+4) and downshift
    paddw       xmm4, [GLOBAL(fours)]
    psraw       xmm4, 3

    ; Predict buffer needs to be expanded from bytes to words
    punpcklbw   xmm0, xmm5
    punpcklbw   xmm1, xmm5
    punpcklbw   xmm2, xmm5
    punpcklbw   xmm3, xmm5

    ; Add to predict buffer
    paddw       xmm0, xmm4
    paddw       xmm1, xmm4
    paddw       xmm2, xmm4
    paddw       xmm3, xmm4

    ; pack up before storing (unsigned saturation to bytes)
    packuswb    xmm0, xmm5
    packuswb    xmm1, xmm5
    packuswb    xmm2, xmm5
    packuswb    xmm3, xmm5

    ; store blocks back out
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1
    movq        [rdi + rdx*2], xmm2
    movq        [rdi + rcx], xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;-----------------------------------------------------------------------
;void vp8_idct_dequant_dc_full_2x_sse2
; (
;   short *qcoeff       - 0
;   short *dequant      - 1
;   unsigned char *dst  - 2
;   int dst_stride      - 3
;   short *dc           - 4
; )
;
; Same as vp8_idct_dequant_full_2x_sse2, except that after dequantization
; coefficient 0 of the first block is replaced by dc[0] and coefficient 0
; of the second block by dc[1] (the inserted values are not multiplied by
; dequant).
;
; Alignment: qcoeff and dequant are accessed with movdqa / SSE memory
;            operands, so both must be 16-byte aligned.
; Regs:  rax = qcoeff, rdx = dequant, then dc, then dst_stride,
;        rdi = dst, rcx = 3 * dst_stride; xmm0-xmm7 all used.
;-----------------------------------------------------------------------
global sym(vp8_idct_dequant_dc_full_2x_sse2) PRIVATE
sym(vp8_idct_dequant_dc_full_2x_sse2):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 5
    SAVE_XMM 7
    GET_GOT     rbx
    push        rdi
    ; end prolog

    mov         rax, arg(0)                 ; qcoeff
    mov         rdx, arg(1)                 ; dequant

    mov         rdi, arg(2)                 ; dst

    ; Zero out xmm7, used here to clear the coefficient buffer
    pxor        xmm7, xmm7

    ; note the swapped use of xmm1 and xmm2 (xmm2 <- +16, xmm1 <- +32),
    ; necessary for the shuffles below to spit out sensible data
    movdqa      xmm0, [rax]
    movdqa      xmm2, [rax+16]
    movdqa      xmm1, [rax+32]
    movdqa      xmm3, [rax+48]

    ; Clear out coeffs
    movdqa      [rax], xmm7
    movdqa      [rax+16], xmm7
    movdqa      [rax+32], xmm7
    movdqa      [rax+48], xmm7

    ; dequantize qcoeff buffer (same dequant table for both blocks)
    pmullw      xmm0, [rdx]
    pmullw      xmm2, [rdx+16]
    pmullw      xmm1, [rdx]
    pmullw      xmm3, [rdx+16]

    ; DC component
    mov         rdx, arg(4)

    ; repack so block 0 row x and block 1 row x are together
    movdqa      xmm4, xmm0
    punpckldq   xmm0, xmm1
    punpckhdq   xmm4, xmm1

    pshufd      xmm0, xmm0, 11011000b
    pshufd      xmm1, xmm4, 11011000b

    movdqa      xmm4, xmm2
    punpckldq   xmm2, xmm3
    punpckhdq   xmm4, xmm3

    pshufd      xmm2, xmm2, 11011000b
    pshufd      xmm3, xmm4, 11011000b

    ; insert DC component (word 0 = block 0 coeff 0, word 4 = block 1 coeff 0)
    pinsrw      xmm0, [rdx], 0
    pinsrw      xmm0, [rdx+2], 4

    ; first pass
    ; x_s1sqr2 (0x8A8C) is negative as a signed word, so pmulhw followed by
    ; paddw x yields x * 0x8A8C / 65536; x_c1sqr2less1 followed by paddw x
    ; yields x * (1 + 0x4E7B / 65536).
    psubw       xmm0, xmm2                  ; b1 = 0-2
    paddw       xmm2, xmm2                  ;

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0                  ; a1 = 0+2

    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1                  ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3                  ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5                  ; xmm7 = ip3*cos - ip1*sin

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1                  ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4                  ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3, xmm5                  ; d1
    movdqa      xmm6, xmm2                  ; a1

    movdqa      xmm4, xmm0                  ; b1
    paddw       xmm2, xmm3                  ; a1 + d1   -> 000..003
    paddw       xmm4, xmm7                  ; b1 + xmm7 -> 008..011
    psubw       xmm0, xmm7                  ; b1 - xmm7 -> 004..007
    psubw       xmm6, xmm3                  ; a1 - d1   -> 012..015

    ; transpose for the second pass
    movdqa      xmm7, xmm2                  ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0                  ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0                  ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4                  ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6                  ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6                  ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2                  ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4                  ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4                  ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7                  ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5                  ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5                  ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2                  ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7                  ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7                  ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1                  ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6                  ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6                  ; 115 111 015 011 107 103 007 003

    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; second pass (same butterfly, plus rounding: +4 then >> 3)
    psubw       xmm0, xmm2                  ; b1 = 0-2
    paddw       xmm2, xmm2

    movdqa      xmm5, xmm1
    paddw       xmm2, xmm0                  ; a1 = 0+2

    pmulhw      xmm5, [GLOBAL(x_s1sqr2)]
    paddw       xmm5, xmm1                  ; ip1 * sin(pi/8) * sqrt(2)

    movdqa      xmm7, xmm3
    pmulhw      xmm7, [GLOBAL(x_c1sqr2less1)]

    paddw       xmm7, xmm3                  ; ip3 * cos(pi/8) * sqrt(2)
    psubw       xmm7, xmm5                  ; xmm7 = ip3*cos - ip1*sin

    movdqa      xmm5, xmm1
    movdqa      xmm4, xmm3

    pmulhw      xmm5, [GLOBAL(x_c1sqr2less1)]
    paddw       xmm5, xmm1                  ; ip1 * cos(pi/8) * sqrt(2)

    pmulhw      xmm3, [GLOBAL(x_s1sqr2)]
    paddw       xmm3, xmm4                  ; ip3 * sin(pi/8) * sqrt(2)

    paddw       xmm3, xmm5                  ; d1
    paddw       xmm0, [GLOBAL(fours)]       ; b1 + 4

    paddw       xmm2, [GLOBAL(fours)]       ; a1 + 4
    movdqa      xmm6, xmm2                  ; a1

    movdqa      xmm4, xmm0                  ; b1
    paddw       xmm2, xmm3                  ; a1 + d1   -> 000..003
    paddw       xmm4, xmm7                  ; b1 + xmm7 -> 008..011
    psubw       xmm0, xmm7                  ; b1 - xmm7 -> 004..007
    psubw       xmm6, xmm3                  ; a1 - d1   -> 012..015

    psraw       xmm2, 3
    psraw       xmm0, 3
    psraw       xmm4, 3
    psraw       xmm6, 3

    ; transpose to save
    movdqa      xmm7, xmm2                  ; 103 102 101 100 003 002 001 000
    punpcklwd   xmm2, xmm0                  ; 007 003 006 002 005 001 004 000
    punpckhwd   xmm7, xmm0                  ; 107 103 106 102 105 101 104 100

    movdqa      xmm5, xmm4                  ; 111 110 109 108 011 010 009 008
    punpcklwd   xmm4, xmm6                  ; 015 011 014 010 013 009 012 008
    punpckhwd   xmm5, xmm6                  ; 115 111 114 110 113 109 112 108

    movdqa      xmm1, xmm2                  ; 007 003 006 002 005 001 004 000
    punpckldq   xmm2, xmm4                  ; 013 009 005 001 012 008 004 000
    punpckhdq   xmm1, xmm4                  ; 015 011 007 003 014 010 006 002

    movdqa      xmm6, xmm7                  ; 107 103 106 102 105 101 104 100
    punpckldq   xmm7, xmm5                  ; 113 109 105 101 112 108 104 100
    punpckhdq   xmm6, xmm5                  ; 115 111 107 103 114 110 106 102

    movdqa      xmm5, xmm2                  ; 013 009 005 001 012 008 004 000
    punpckldq   xmm2, xmm7                  ; 112 108 012 008 104 100 004 000
    punpckhdq   xmm5, xmm7                  ; 113 109 013 009 105 101 005 001

    movdqa      xmm7, xmm1                  ; 015 011 007 003 014 010 006 002
    punpckldq   xmm1, xmm6                  ; 114 110 014 010 106 102 006 002
    punpckhdq   xmm7, xmm6                  ; 115 111 015 011 107 103 007 003

    pshufd      xmm0, xmm2, 11011000b
    pshufd      xmm2, xmm1, 11011000b

    pshufd      xmm1, xmm5, 11011000b
    pshufd      xmm3, xmm7, 11011000b

    ; xmm7 becomes the zero register for unpack/pack from here on
    pxor        xmm7, xmm7

    ; Load up predict blocks
    movsxd      rdx, dword ptr arg(3)       ; dst_stride
    movq        xmm4, [rdi]
    movq        xmm5, [rdi+rdx]
    lea         rcx, [rdx + rdx*2]          ; dst_stride * 3

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm0, xmm4
    paddw       xmm1, xmm5

    movq        xmm4, [rdi+rdx*2]
    movq        xmm5, [rdi+rcx]

    punpcklbw   xmm4, xmm7
    punpcklbw   xmm5, xmm7

    paddw       xmm2, xmm4
    paddw       xmm3, xmm5

    ; pack up before storing (unsigned saturation to bytes)
    packuswb    xmm0, xmm7
    packuswb    xmm1, xmm7
    packuswb    xmm2, xmm7
    packuswb    xmm3, xmm7

    ; store blocks back out; rdx (stride) and rcx (3 * stride) are still
    ; live from the predict-block loads above
    movq        [rdi], xmm0
    movq        [rdi + rdx], xmm1
    movq        [rdi + rdx*2], xmm2
    movq        [rdi + rcx], xmm3

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    RESTORE_XMM
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; Rounding term added before the final arithmetic shift right by 3.
align 16
fours:
    times 8 dw 0x0004
; 0x8A8C = 35468; used with pmulhw + paddw for the sin(pi/8)*sqrt(2) factor.
align 16
x_s1sqr2:
    times 8 dw 0x8A8C
; 0x4E7B = 20091; used with pmulhw + paddw for the cos(pi/8)*sqrt(2) factor.
align 16
x_c1sqr2less1:
    times 8 dw 0x4E7B