;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

; /****************************************************************************
; * Notes:
; *
; * This implementation uses 16-bit fixed-point versions of two multiply
; * constants:
; *    1.   sqrt(2) * cos (pi/8)
; *    2.   sqrt(2) * sin (pi/8)
; *
; * The first constant is larger than 1. To keep the same 16-bit fixed-point
; * precision as the second one, only its fractional part is stored
; * (x_c1sqr2less1) and the product is rebuilt with
; *    x * a = x + x * (a - 1)
; * so
; *    x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) * cos(pi/8) - 1).
; *
; * The 16-bit value of the second constant (x_s1sqr2) is 35468, which does
; * not fit in a signed 16-bit word: pmulhw treats it as 35468 - 65536.
; * Adding x back after the multiply compensates for that:
; *    (x * (unsigned)35468) >> 16  =  ((x * (signed)35468) >> 16) + x
; *
; **************************************************************************/


; ---------------------------------------------------------------------------
; Helper macros.  Each one expands to a fixed instruction sequence that works
; on hard-wired MMX registers, so the register contracts below matter.
; ---------------------------------------------------------------------------

; First half of one 1-D pass.
;   in : mm0 = ip0, mm1 = ip1, mm2 = ip2, mm3 = ip3   (4 x int16 each)
;   out: mm2 = a1 = ip0 + ip2
;        mm0 = b1 = ip0 - ip2
;        mm3 = d1 = ip1 * sqrt(2)cos(pi/8) + ip3 * sqrt(2)sin(pi/8)
;        mm7 =      ip3 * sqrt(2)cos(pi/8) - ip1 * sqrt(2)sin(pi/8)
;   mm1 is left unchanged; mm4 and mm5 are scratch.
%macro VP8_IDCT4_MMX_ROTATE 0
    psubw       mm0,        mm2                     ; b1 = ip0 - ip2
    paddw       mm2,        mm2                     ; 2 * ip2

    movq        mm5,        mm1
    paddw       mm2,        mm0                     ; a1 = ip0 + ip2

    pmulhw      mm5,        [GLOBAL(x_s1sqr2)]
    paddw       mm5,        mm1                     ; ip1 * sqrt(2) * sin(pi/8)

    movq        mm7,        mm3
    pmulhw      mm7,        [GLOBAL(x_c1sqr2less1)]

    paddw       mm7,        mm3                     ; ip3 * sqrt(2) * cos(pi/8)
    psubw       mm7,        mm5                     ; ip3 term minus ip1 term

    movq        mm5,        mm1
    movq        mm4,        mm3                     ; keep ip3 for the "+ x" fix-up

    pmulhw      mm5,        [GLOBAL(x_c1sqr2less1)]
    paddw       mm5,        mm1                     ; ip1 * sqrt(2) * cos(pi/8)

    pmulhw      mm3,        [GLOBAL(x_s1sqr2)]
    paddw       mm3,        mm4                     ; ip3 * sqrt(2) * sin(pi/8)

    paddw       mm3,        mm5                     ; d1
%endmacro

; Second half of one 1-D pass: combine the even and odd parts.
;   in : mm2 = a1, mm0 = b1, mm3 = d1, mm7 as produced by the macro above
;   out: mm2 = a1 + d1      (index 0 in the transpose that follows)
;        mm0 = b1 - mm7     (index 1)
;        mm4 = b1 + mm7     (index 2)
;        mm6 = a1 - d1      (index 3)
%macro VP8_IDCT4_MMX_BUTTERFLY 0
    movq        mm6,        mm2                     ; a1
    movq        mm4,        mm0                     ; b1

    paddw       mm2,        mm3                     ; a1 + d1
    paddw       mm4,        mm7                     ; b1 + mm7
    psubw       mm0,        mm7                     ; b1 - mm7
    psubw       mm6,        mm3                     ; a1 - d1
%endmacro

; 4x4 transpose of 16-bit words.
;   in : mm2, mm0, mm4, mm6 = vectors 0, 1, 2, 3
;   out: mm0, mm1, mm2, mm5 = transposed vectors 0, 1, 2, 3
;   mm3 and mm4 are scratch.
; The two-digit labels are <vector><element>, listed from the high word down.
%macro VP8_IDCT4_MMX_TRANSPOSE 0
    movq        mm1,        mm2                     ; 03 02 01 00
    movq        mm3,        mm4                     ; 23 22 21 20

    punpcklwd   mm1,        mm0                     ; 11 01 10 00
    punpckhwd   mm2,        mm0                     ; 13 03 12 02

    punpcklwd   mm3,        mm6                     ; 31 21 30 20
    punpckhwd   mm4,        mm6                     ; 33 23 32 22

    movq        mm0,        mm1                     ; 11 01 10 00
    movq        mm5,        mm2                     ; 13 03 12 02

    punpckldq   mm0,        mm3                     ; 30 20 10 00
    punpckhdq   mm1,        mm3                     ; 31 21 11 01

    punpckldq   mm2,        mm4                     ; 32 22 12 02
    punpckhdq   mm5,        mm4                     ; 33 23 13 03
%endmacro

; Reconstruct one 4-pixel row: dest = clamp_to_u8(pred + residual).
;   arg 1: MMX register holding four 16-bit residuals (overwritten)
;   arg 2: memory operand of the four predictor bytes
;   arg 3: memory operand receiving the four output bytes
;   mm7 must be zero; mm4 is scratch.
%macro VP8_IDCT4_MMX_RECON_ROW 3
    movd        mm4,        %2
    punpcklbw   mm4,        mm7                     ; predictor bytes -> words
    paddsw      %1,         mm4
    packuswb    %1,         mm7                     ; saturate to unsigned bytes
    movd        %3,         %1
%endmacro

; Add the broadcast DC value to one row of predictor bytes.
;   arg 1: MMX register holding four predictor bytes in its low dword;
;          on exit the low dword holds the four clamped result bytes
;   mm0 must be zero; mm5 holds the DC value in all four words.
%macro VP8_IDCT4_MMX_DC_ROW 1
    punpcklbw   %1,         mm0                     ; predictor bytes -> words
    paddsw      %1,         mm5
    packuswb    %1,         mm0                     ; saturate to unsigned bytes
%endmacro


;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
;int pitch, unsigned char *dest,int stride)
;
; Reads 16 consecutive 16-bit coefficients from input, runs two 1-D inverse
; transform passes over them (with a transpose after each pass), rounds the
; result with (x + 4) >> 3, adds it to a 4x4 block of predictor bytes
; (rows pitch bytes apart) and writes the clamped bytes to dest (rows stride
; bytes apart).  The coefficient buffer is only read; it is not cleared here.
global sym(vp8_short_idct4x4llm_mmx) PRIVATE
sym(vp8_short_idct4x4llm_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rax,        arg(0)                  ; input
    mov         rsi,        arg(1)                  ; pred

    ; four vectors of four coefficients each
    movq        mm0,        [rax   ]
    movq        mm1,        [rax+ 8]
    movq        mm2,        [rax+16]
    movq        mm3,        [rax+24]

    ; rax is free again once the coefficients are loaded
    movsxd      rax,        dword ptr arg(2)        ; pitch
    mov         rdx,        arg(3)                  ; dest
    movsxd      rdi,        dword ptr arg(4)        ; stride

    ; ---- first pass (no rounding, no shift) ----
    VP8_IDCT4_MMX_ROTATE
    VP8_IDCT4_MMX_BUTTERFLY
    VP8_IDCT4_MMX_TRANSPOSE

    ; the transpose leaves vector 3 in mm5; the next pass expects it in mm3
    movq        mm3,        mm5                     ; 33 23 13 03

    ; ---- second pass ----
    VP8_IDCT4_MMX_ROTATE

    ; Every output is a1 +/- d1 or b1 +/- mm7, so biasing a1 and b1 by 4
    ; rounds all four results of the >> 3 below.
    paddw       mm0,        [GLOBAL(fours)]
    paddw       mm2,        [GLOBAL(fours)]

    VP8_IDCT4_MMX_BUTTERFLY

    psraw       mm2,        3
    psraw       mm0,        3
    psraw       mm4,        3
    psraw       mm6,        3

    VP8_IDCT4_MMX_TRANSPOSE

    ; ---- add the predictor and store: rows are now in mm0, mm1, mm2, mm5 ----
    pxor        mm7,        mm7

    VP8_IDCT4_MMX_RECON_ROW mm0, [rsi],       [rdx]
    VP8_IDCT4_MMX_RECON_ROW mm1, [rsi+rax],   [rdx+rdi]
    VP8_IDCT4_MMX_RECON_ROW mm2, [rsi+2*rax], [rdx+rdi*2]

    ; step both pointers down one row so "+ 2 * stride" addresses row 3
    add         rdx,        rdi
    add         rsi,        rax

    VP8_IDCT4_MMX_RECON_ROW mm5, [rsi+2*rax], [rdx+rdi*2]

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

;void vp8_dc_only_idct_add_mmx(
;short input_dc,
;unsigned char *pred_ptr,
;int pred_stride,
;unsigned char *dst_ptr,
;int stride)
;
; Computes (input_dc + 4) >> 3 once, adds that value to each byte of a 4x4
; predictor block (rows pred_stride bytes apart) and writes the clamped bytes
; to dst_ptr (rows stride bytes apart).
global sym(vp8_dc_only_idct_add_mmx) PRIVATE
sym(vp8_dc_only_idct_add_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 5
    GET_GOT     rbx
    ; end prolog

    movd        mm5,        arg(0)                  ; input_dc
    mov         rax,        arg(1)                  ; pred_ptr
    movsxd      rdx,        dword ptr arg(2)        ; pred_stride

    pxor        mm0,        mm0                     ; zero, used for unpack/pack

    paddw       mm5,        [GLOBAL(fours)]         ; rounding bias
    lea         rcx,        [rdx + rdx*2]           ; 3 * pred_stride

    psraw       mm5,        3                       ; (dc + 4) >> 3 in the low word

    ; replicate the low word into all four words of mm5
    punpcklwd   mm5,        mm5
    punpckldq   mm5,        mm5

    ; fetch the four predictor rows
    movd        mm1,        [rax]
    movd        mm2,        [rax+rdx]
    movd        mm3,        [rax+2*rdx]
    movd        mm4,        [rax+rcx]

    ; rax/rdx are reused for the destination from here on
    mov         rax,        arg(3)                  ; dst_ptr
    movsxd      rdx,        dword ptr arg(4)        ; stride

    VP8_IDCT4_MMX_DC_ROW mm1
    lea         rcx,        [rdx + rdx*2]           ; 3 * stride

    VP8_IDCT4_MMX_DC_ROW mm2
    VP8_IDCT4_MMX_DC_ROW mm3
    VP8_IDCT4_MMX_DC_ROW mm4

    ; write the four result rows
    movd        [rax],      mm1
    movd        [rax+rdx],  mm2
    movd        [rax+2*rdx], mm3
    movd        [rax+rcx],  mm4

    ; begin epilog
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; sqrt(2) * sin(pi/8) in 16-bit fixed point (35468); see the notes at the top
align 16
x_s1sqr2:
    times 4 dw 0x8A8C
; sqrt(2) * cos(pi/8) - 1 in 16-bit fixed point (20091)
align 16
x_c1sqr2less1:
    times 4 dw 0x4E7B
; rounding bias applied before the final >> 3
align 16
fours:
    times 4 dw 0x0004