;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;

; File: media/libvpx/vp8/common/x86/dequantize_mmx.asm
;
; Syntax: Intel operand order (op dst, src), NASM/yasm preprocessor.
; Argument access (arg(n)), symbol decoration (sym), GOT handling and the
; prolog/epilog helpers all come from x86_abi_support.asm.
;
; NOTE(review): neither routine executes emms; presumably the caller is
; responsible for clearing MMX state before using x87 code -- confirm.

%include "vpx_ports/x86_abi_support.asm"


;-----------------------------------------------------------------------
; DQ_MUL_QWORD byte_offset
;   dq[off] = sq[off] * q[off] for four packed 16-bit lanes.
;   In:    rsi = sq, rax = q, rdi = dq
;   Clobb: mm1
;-----------------------------------------------------------------------
%macro DQ_MUL_QWORD 1
    movq        mm1,        [rsi+%1]
    pmullw      mm1,        [rax+%1]        ; low 16 bits of each product
    movq        [rdi+%1],   mm1
%endmacro

;-----------------------------------------------------------------------
; IDCT4_BUTTERFLY
;   First half of one 1-D pass over four lanes at a time.
;   In:    mm0..mm3 = ip0..ip3
;   Out:   mm2 = a1 = ip0 + ip2
;          mm0 = b1 = ip0 - ip2
;          mm3 = d1 = ip1*C + ip3*S
;          mm7 =      ip3*C - ip1*S
;   where  S = 0x8A8C/65536 and C = 1 + 0x4E7B/65536.
;   Clobb: mm4, mm5
;
;   pmulhw is a signed multiply. 0x8A8C is negative as a signed word, so
;   pmulhw yields x*S - x and the following paddw adds x back. 0x4E7B is
;   positive, so pmulhw + paddw yields x*(C-1) + x.
;-----------------------------------------------------------------------
%macro IDCT4_BUTTERFLY 0
    psubw       mm0,        mm2             ; b1 = ip0 - ip2
    paddw       mm2,        mm2             ; 2*ip2

    movq        mm5,        mm1
    paddw       mm2,        mm0             ; a1 = ip0 + ip2

    pmulhw      mm5,        [GLOBAL(x_s1sqr2)]
    paddw       mm5,        mm1             ; ip1 * S

    movq        mm7,        mm3
    pmulhw      mm7,        [GLOBAL(x_c1sqr2less1)]

    paddw       mm7,        mm3             ; ip3 * C
    psubw       mm7,        mm5             ; ip3*C - ip1*S

    movq        mm5,        mm1
    movq        mm4,        mm3

    pmulhw      mm5,        [GLOBAL(x_c1sqr2less1)]
    paddw       mm5,        mm1             ; ip1 * C

    pmulhw      mm3,        [GLOBAL(x_s1sqr2)]
    paddw       mm3,        mm4             ; ip3 * S

    paddw       mm3,        mm5             ; d1 = ip1*C + ip3*S
%endmacro

;-----------------------------------------------------------------------
; IDCT4_RECOMBINE
;   Second half of one 1-D pass.
;   In:    mm2 = a1, mm0 = b1, mm3 = d1, mm7 = ip3*C - ip1*S
;   Out:   mm2 = a1 + d1        (output 0)
;          mm0 = b1 - mm7       (output 1)
;          mm4 = b1 + mm7       (output 2)
;          mm6 = a1 - d1        (output 3)
;   The output numbering is the order in which IDCT4_TRANSPOSE consumes
;   these registers.
;-----------------------------------------------------------------------
%macro IDCT4_RECOMBINE 0
    movq        mm6,        mm2             ; a1
    movq        mm4,        mm0             ; b1

    paddw       mm2,        mm3             ; output 0
    paddw       mm4,        mm7             ; output 2

    psubw       mm0,        mm7             ; output 1
    psubw       mm6,        mm3             ; output 3
%endmacro

;-----------------------------------------------------------------------
; IDCT4_TRANSPOSE
;   4x4 transpose of 16-bit elements.
;   In:    rows 0..3 in mm2, mm0, mm4, mm6
;   Out:   columns 0..3 in mm0, mm1, mm2, mm5
;   Clobb: mm3, mm4
;   Lane comments read "row column", highest lane first.
;-----------------------------------------------------------------------
%macro IDCT4_TRANSPOSE 0
    movq        mm1,        mm2             ; 03 02 01 00
    movq        mm3,        mm4             ; 23 22 21 20

    punpcklwd   mm1,        mm0             ; 11 01 10 00
    punpckhwd   mm2,        mm0             ; 13 03 12 02

    punpcklwd   mm3,        mm6             ; 31 21 30 20
    punpckhwd   mm4,        mm6             ; 33 23 32 22

    movq        mm0,        mm1             ; 11 01 10 00
    movq        mm5,        mm2             ; 13 03 12 02

    punpckldq   mm0,        mm3             ; 30 20 10 00
    punpckhdq   mm1,        mm3             ; 31 21 11 01

    punpckldq   mm2,        mm4             ; 32 22 12 02
    punpckhdq   mm5,        mm4             ; 33 23 13 03
%endmacro

;-----------------------------------------------------------------------
; IDCT4_ADD_ROW residual_reg, dest_address
;   Adds four 16-bit residuals to the four bytes at [dest_address] and
;   writes the result back, clamped to 0..255 by packuswb.
;   In:    mm7 = 0
;   Clobb: mm4, residual_reg
;-----------------------------------------------------------------------
%macro IDCT4_ADD_ROW 2
    movd        mm4,        [%2]
    punpcklbw   mm4,        mm7             ; widen 4 bytes to 4 words
    paddsw      %1,         mm4
    packuswb    %1,         mm7             ; unsigned-saturate to bytes
    movd        [%2],       %1
%endmacro


;-----------------------------------------------------------------------
;void vp8_dequantize_b_impl_mmx(short *sq, short *dq, short *q)
;
; dq[i] = sq[i] * q[i] for i = 0..15 (four qwords of four words each).
; Only the low 16 bits of each product are kept.
;
; Regs:  rsi = sq, rdi = dq, rax = q, mm1 = scratch
;-----------------------------------------------------------------------
global sym(vp8_dequantize_b_impl_mmx) PRIVATE
sym(vp8_dequantize_b_impl_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 3
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi,        arg(0)          ; sq
    mov         rdi,        arg(1)          ; dq
    mov         rax,        arg(2)          ; q

    DQ_MUL_QWORD 0                          ; coefficients  0..3
    DQ_MUL_QWORD 8                          ; coefficients  4..7
    DQ_MUL_QWORD 16                         ; coefficients  8..11
    DQ_MUL_QWORD 24                         ; coefficients 12..15

    ; begin epilog
    pop         rdi
    pop         rsi
    UNSHADOW_ARGS
    pop         rbp
    ret


;-----------------------------------------------------------------------
;void dequant_idct_add_mmx(
;short *input,            0
;short *dq,               1
;unsigned char *dest,     2
;int stride)              3
;
; Steps:
;   1. Multiply the 16 coefficients at input by dq (low 16 bits kept).
;   2. Overwrite the 32 bytes at input with zero.
;   3. Run two 1-D passes, each followed by a 4x4 transpose. The second
;      pass adds 4 and shifts right arithmetically by 3.
;   4. Add the result to the 4x4 block of bytes at dest (rows `stride`
;      bytes apart), clamp to 0..255 and store it back.
;
; Regs:  rax = input, rdx = dq then dest, rdi = stride (sign-extended),
;        mm0..mm7 = working set
;-----------------------------------------------------------------------
global sym(vp8_dequant_idct_add_mmx) PRIVATE
sym(vp8_dequant_idct_add_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 4
    GET_GOT     rbx
    push        rdi
    ; end prolog

    mov         rax,        arg(0)          ; input
    mov         rdx,        arg(1)          ; dq

    ; --- dequantise: mm0..mm3 = the four groups of four coefficients ---
    movq        mm0,        [rax]
    pmullw      mm0,        [rdx]

    movq        mm1,        [rax+8]
    pmullw      mm1,        [rdx+8]

    movq        mm2,        [rax+16]
    pmullw      mm2,        [rdx+16]

    movq        mm3,        [rax+24]
    pmullw      mm3,        [rdx+24]

    mov         rdx,        arg(2)          ; dest (dq no longer needed)

    ; --- clear the coefficient buffer ---
    pxor        mm7,        mm7

    movq        [rax],      mm7
    movq        [rax+8],    mm7

    movq        [rax+16],   mm7
    movq        [rax+24],   mm7

    movsxd      rdi,        dword ptr arg(3) ; stride

    ; --- first 1-D pass ---
    IDCT4_BUTTERFLY
    IDCT4_RECOMBINE
    IDCT4_TRANSPOSE

    movq        mm3,        mm5             ; second pass expects ip3 in mm3

    ; --- second 1-D pass, with rounding ---
    IDCT4_BUTTERFLY

    ; a1 and b1 each feed two outputs, so biasing them by 4 biases all
    ; four outputs ahead of the >> 3 below.
    paddw       mm0,        [GLOBAL(fours)]
    paddw       mm2,        [GLOBAL(fours)]

    IDCT4_RECOMBINE

    psraw       mm2,        3
    psraw       mm0,        3

    psraw       mm4,        3
    psraw       mm6,        3

    IDCT4_TRANSPOSE                         ; mm0, mm1, mm2, mm5 feed dest rows 0..3

    ; --- add to the destination block ---
    pxor        mm7,        mm7

    IDCT4_ADD_ROW mm0, rdx                  ; row 0
    IDCT4_ADD_ROW mm1, rdx+rdi              ; row 1
    IDCT4_ADD_ROW mm2, rdx+rdi*2            ; row 2

    add         rdx,        rdi             ; rdx = dest + stride

    IDCT4_ADD_ROW mm5, rdx+rdi*2            ; row 3 = dest + 3*stride

    ; begin epilog
    pop         rdi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret


SECTION_RODATA
; Multipliers for the pmulhw/paddw pairs in IDCT4_BUTTERFLY.
align 16
x_s1sqr2:
    times 4 dw 0x8A8C                       ; 35468/65536 ~ sin(pi/8)*sqrt(2)
align 16
x_c1sqr2less1:
    times 4 dw 0x4E7B                       ; 20091/65536 ~ cos(pi/8)*sqrt(2) - 1
align 16
fours:
    times 4 dw 0x0004                       ; rounding bias before >> 3