;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

;-----------------------------------------------------------------------
; SSE2 forward 4x4 DCT kernels (one block, and two side-by-side blocks).
;
; Syntax : Intel operand order (NASM/YASM).
; ABIs   : selected at assembly time through ABI_IS_32BIT and
;          LIBVPX_YASM_WIN64; see STACK_FRAME_CREATE below.
; Helper macros sym(), arg(), GLOBAL(), GET_GOT, RESTORE_GOT, SAVE_XMM,
; RESTORE_XMM, SECTION_RODATA and PRIVATE come from the included
; x86_abi_support.asm and are not defined in this file.
;-----------------------------------------------------------------------

%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; STACK_FRAME_CREATE
;   Binds the aliases `input`, `output` and `pitch` to the registers that
;   hold the three C arguments and emits the matching prologue.
;
;   32-bit       : input=rsi output=rdi pitch=rax, loaded from arg(0..2);
;                  rbp, rsi and rdi are pushed (plus whatever GET_GOT
;                  saves).
;   64-bit Win64 : input=rcx output=rdx pitch=r8.
;   64-bit other : input=rdi output=rsi pitch=rdx.
;
;   `pitch` is declared as a C `int`.  It is added to a pointer without
;   scaling, so it is a byte distance and must be a full-width signed
;   value: every branch sign-extends it before first use.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE 0
%if ABI_IS_32BIT
  %define       input       rsi
  %define       output      rdi
  %define       pitch       rax
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)
    mov         rdi, arg(1)

    movsxd      rax, dword ptr arg(2)
%else
  %if LIBVPX_YASM_WIN64
    %define     input       rcx
    %define     output      rdx
    %define     pitch       r8
    ; NOTE(review): presumably preserves xmm6-xmm7, which are callee-saved
    ; on Win64 (xmm6 is written by the 8x4 kernel) - confirm against
    ; x86_abi_support.asm.
    SAVE_XMM 7, u
    ; The upper 32 bits of a 32-bit argument register are not defined by
    ; the calling convention; widen before using it in an address.
    movsxd      r8, r8d
  %else
    %define     input       rdi
    %define     output      rsi
    %define     pitch       rdx
    ; The upper 32 bits of a 32-bit argument register are not defined by
    ; the calling convention; widen before using it in an address.
    movsxd      rdx, edx
  %endif
%endif
%endmacro

;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY
;   Clears the `input`/`output`/`pitch` aliases, undoes the prologue
;   emitted by STACK_FRAME_CREATE (pops in reverse push order) and
;   returns to the caller.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY 0
  %define     input
  %define     output
  %define     pitch

%if ABI_IS_32BIT
    pop         rdi
    pop         rsi
    RESTORE_GOT
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro

;-----------------------------------------------------------------------
;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
;
; In:    input  - four rows of four 16-bit values; each row is read with
;                 an 8-byte movq, rows are `pitch` bytes apart.
;        output - 16 x 16-bit results (32 bytes).  Stored with movdqa, so
;                 the pointer must be 16-byte aligned.
;        pitch  - byte distance between consecutive input rows.
; Clobb: xmm0-xmm5, the `input` register (advanced by 2*pitch).
;-----------------------------------------------------------------------
global sym(vp8_short_fdct4x4_sse2) PRIVATE
sym(vp8_short_fdct4x4_sse2):

    STACK_FRAME_CREATE

    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
    lea         input,          [input+2*pitch]
    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30

    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20

    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2                      ;31 21 30 20 11 10 01 00
    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx

    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3

    ; pmaddwd against (1,1) / (1,-1) pairs forms the 32-bit sum and
    ; difference of each adjacent a1/b1 word pair.
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]     ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]     ;a1 - b1
    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352

    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12

    packssdw    xmm0, xmm1                      ;op[2] op[0]
    packssdw    xmm3, xmm4                      ;op[3] op[1]
    ; 23 22 21 20 03 02 01 00
    ;
    ; 33 32 31 30 13 12 11 10
    ;
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhqdq  xmm2, xmm3                      ;23 22 21 20 33 32 31 30

    movdqa      xmm3, xmm0
    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20

    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]
    pshufd      xmm2, xmm2, 04eh
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1

    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
    movdqa      xmm2, xmm3                      ;save d1 for compare
    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]     ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]     ;a1 - b1

    pxor        xmm4, xmm4                      ;zero out for compare
    paddd       xmm0, xmm5
    paddd       xmm1, xmm5
    pcmpeqw     xmm2, xmm4                      ;0xffff where the word == 0
    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
    ; pandn = (~xmm2) & mask: 1 in each low-half word where d1 != 0,
    ; 0 everywhere in the upper half.
    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)]     ;clear upper,
                                                         ;and keep bit 0 of lower

    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]    ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)] ;d1*2217 - c1*5352
    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
    packssdw    xmm0, xmm1                      ;op[8] op[0]
    psrad       xmm3, 16                ;(c1 * 2217 + d1 * 5352 +  12000)>>16
    psrad       xmm4, 16                ;(d1 * 2217 - c1 * 5352 +  51000)>>16

    packssdw    xmm3, xmm4                      ;op[12] op[4]
    movdqa      xmm1, xmm0
    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]

    movdqa      XMMWORD PTR[output +  0], xmm0
    movdqa      XMMWORD PTR[output + 16], xmm1

    STACK_FRAME_DESTROY

;-----------------------------------------------------------------------
;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
;
; Same arithmetic as the 4x4 kernel, applied to columns 0-3 and columns
; 4-7 of an 8-wide, 4-row input at the same time (one block per 64-bit
; half of each register).
;
; In:    input  - four rows of eight 16-bit values, `pitch` bytes apart.
;                 Rows are loaded with movdqa, so `input` and every row
;                 address derived from `pitch` must be 16-byte aligned.
;        output - 32 x 16-bit results (64 bytes), stored with movdqa
;                 (16-byte aligned).  Bytes 0-31 receive the low register
;                 halves (columns 0-3), bytes 32-63 the high halves
;                 (columns 4-7).
;        pitch  - byte distance between consecutive input rows.
; Clobb: xmm0-xmm6, the `input` register (advanced by 2*pitch).
;-----------------------------------------------------------------------
global sym(vp8_short_fdct8x4_sse2) PRIVATE
sym(vp8_short_fdct8x4_sse2):

    STACK_FRAME_CREATE

        ; read the input data
        movdqa      xmm0,       [input        ]
        movdqa      xmm2,       [input+  pitch]
        lea         input,      [input+2*pitch]
        movdqa      xmm4,       [input        ]
        movdqa      xmm3,       [input+  pitch]

        ; transpose for the first stage
        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27

        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17

        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37

        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31

        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33

        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35

        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33

        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36

        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34

        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35

        ; xmm0 0
        ; xmm1 1
        ; xmm2 2
        ; xmm3 3

        ; first stage
        movdqa      xmm5,       xmm0
        movdqa      xmm4,       xmm1

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm2        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        psllw       xmm5,       3
        psllw       xmm4,       3

        psllw       xmm0,       3
        psllw       xmm1,       3

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
        psubw       xmm2,       xmm1        ; op[2] = a1 - b1

        ; output 1 and 3
        ; interleave c1, d1
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; c1 d1
        punpckhwd   xmm5,       xmm4        ; c1 d1

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]

        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12

        packssdw    xmm1,       xmm4        ; op[1]
        packssdw    xmm3,       xmm5        ; op[3]

        ; done with vertical
        ; transpose for the second stage
        movdqa      xmm4,       xmm0        ; 00 10 20 30 04 14 24 34
        movdqa      xmm5,       xmm2        ; 02 12 22 32 06 16 26 36

        punpcklwd   xmm0,       xmm1        ; 00 01 10 11 20 21 30 31
        punpckhwd   xmm4,       xmm1        ; 04 05 14 15 24 25 34 35

        punpcklwd   xmm2,       xmm3        ; 02 03 12 13 22 23 32 33
        punpckhwd   xmm5,       xmm3        ; 06 07 16 17 26 27 36 37

        movdqa      xmm1,       xmm0        ; 00 01 10 11 20 21 30 31
        punpckldq   xmm0,       xmm2        ; 00 01 02 03 10 11 12 13

        punpckhdq   xmm1,       xmm2        ; 20 21 22 23 30 31 32 33

        movdqa      xmm2,       xmm4        ; 04 05 14 15 24 25 34 35
        punpckldq   xmm2,       xmm5        ; 04 05 06 07 14 15 16 17

        punpckhdq   xmm4,       xmm5        ; 24 25 26 27 34 35 36 37
        movdqa      xmm3,       xmm1        ; 20 21 22 23 30 31 32 33

        punpckhqdq  xmm3,       xmm4        ; 30 31 32 33 34 35 36 37
        punpcklqdq  xmm1,       xmm4        ; 20 21 22 23 24 25 26 27

        movdqa      xmm4,       xmm0        ; 00 01 02 03 10 11 12 13
        punpcklqdq  xmm0,       xmm2        ; 00 01 02 03 04 05 06 07

        punpckhqdq  xmm4,       xmm2        ; 10 11 12 13 14 15 16 17

        ; xmm0 0
        ; xmm4 1
        ; xmm1 2
        ; xmm3 3

        ; second stage
        movdqa      xmm5,       xmm0        ; copy of row 0
        movdqa      xmm2,       xmm1        ; copy of row 2

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm4        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        pxor        xmm6,       xmm6        ; zero out for compare

        pcmpeqw     xmm6,       xmm5        ; 0xffff where d1 == 0

        ; pandn = (~xmm6) & mask: 1 in every word where d1 != 0
        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; keep bit 0 of
                                                                    ; every word

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; a1 + b1
        psubw       xmm2,       xmm1        ; a1 - b1

        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]

        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4

        ; output 1 and 3
        ; interleave c1, d1
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; c1 d1
        punpckhwd   xmm5,       xmm4        ; c1 d1

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]

        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16

        packssdw    xmm1,       xmm4        ; op[4]
        packssdw    xmm3,       xmm5        ; op[12]

        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)

        ; Regroup: low halves (columns 0-3) go to output+0/+16,
        ; high halves (columns 4-7) go to output+32/+48.
        movdqa      xmm4,       xmm0
        movdqa      xmm5,       xmm2

        punpcklqdq  xmm0,       xmm1
        punpckhqdq  xmm4,       xmm1

        punpcklqdq  xmm2,       xmm3
        punpckhqdq  xmm5,       xmm3

        movdqa      XMMWORD PTR[output + 0 ],  xmm0
        movdqa      XMMWORD PTR[output + 16],  xmm2
        movdqa      XMMWORD PTR[output + 32],  xmm4
        movdqa      XMMWORD PTR[output + 48],  xmm5

    STACK_FRAME_DESTROY

;-----------------------------------------------------------------------
; Read-only constants.  Each table is 16 bytes wide and 16-byte aligned
; because it is used directly as an XMMWORD memory operand.
;-----------------------------------------------------------------------
SECTION_RODATA
; pmaddwd multiplier pairs (5352, 2217)
align 16
_5352_2217:
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
; pmaddwd multiplier pairs (2217, -5352)
align 16
_2217_neg5352:
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
; pmaddwd against all ones: sum of each adjacent word pair
align 16
_mult_add:
    times 8 dw 1
; 4x4 kernel: bit 0 kept in the low four words only
align 16
_cmp_mask:
    times 4 dw 1
    times 4 dw 0
; 8x4 kernel: bit 0 kept in all eight words
align 16
_cmp_mask8x4:
    times 8 dw 1
; pmaddwd against (1, -1): difference of each adjacent word pair
align 16
_mult_sub:
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
; rounding term added before >>4 (dword lanes)
align 16
_7:
    times 4 dd 7
; rounding term added before >>4 (word lanes)
align 16
_7w:
    times 8 dw 7
; rounding terms added before >>12 (first pass)
align 16
_14500:
    times 4 dd 14500
align 16
_7500:
    times 4 dd 7500
; rounding terms added before >>16 (second pass)
align 16
_12000:
    times 4 dd 12000
align 16
_51000:
    times 4 dd 51000