;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
;void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
;
; Reads 16 signed 16-bit values from input (four quadwords at byte
; offsets 0, 8, 16, 24) and applies the same 4-point butterfly twice,
; with a 4x4 word transpose in between:
;
;     a1 = ip[0] + ip[12]        b1 = ip[4] + ip[8]
;     d1 = ip[0] - ip[12]        c1 = ip[4] - ip[8]
;     results: a1 + b1, d1 + c1, a1 - b1, d1 - c1
;
; After the second pass every result has 3 added and is arithmetically
; shifted right by 3.  All arithmetic is packed 16-bit and wraps
; (paddw/psubw); nothing is saturated.
;
; Output layout: each of the 16 results is stored as a single word at
; byte offset 32*k from output (k = 0..15), i.e. into every 16th short.
; For second-pass word lane n (0..3):
;     k = 4*n + 0 : a1 + b1
;     k = 4*n + 1 : d1 + c1
;     k = 4*n + 2 : a1 - b1
;     k = 4*n + 3 : d1 - c1
;
; Args:   arg(0) = input, arg(1) = output (fetched through the arg()
;         macro from the included ABI support header).
; Clobb:  rax, rcx, rdx, mm0-mm7, flags.
; NOTE(review): no EMMS is issued before returning; presumably the
;         caller is responsible for clearing MMX state -- confirm.
;-----------------------------------------------------------------------
global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE
sym(vp8_short_inv_walsh4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    ; end prolog

    mov         rdx, arg(0)       ;rdx = input
    mov         rax, 30003h       ;two word lanes of the rounding term 3

    movq        mm0, [rdx + 0]    ;ip[0]
    movq        mm1, [rdx + 8]    ;ip[4]
    ; Use the 32-bit movd form: "movq mm, r64" only exists in 64-bit
    ; mode, whereas movd from eax assembles for both 32- and 64-bit
    ; targets.  The constant fits in 32 bits and movd zero-extends, so
    ; mm7 = 0000000000030003h either way.
    movd        mm7, eax

    movq        mm2, [rdx + 16]   ;ip[8]
    movq        mm3, [rdx + 24]   ;ip[12]
    punpcklwd   mm7, mm7          ;0003000300030003h
    mov         rdx, arg(1)       ;rdx = output from here on

    ; ---- first pass: butterfly across the four input quadwords ----
    movq        mm4, mm0
    movq        mm5, mm1

    paddw       mm4, mm3          ;ip[0] + ip[12] aka a1
    paddw       mm5, mm2          ;ip[4] + ip[8] aka b1

    movq        mm6, mm4          ;temp a1
    paddw       mm4, mm5          ;a1 + b1
    psubw       mm6, mm5          ;a1 - b1

    psubw       mm0, mm3          ;ip[0] - ip[12] aka d1
    psubw       mm1, mm2          ;ip[4] - ip[8] aka c1

    movq        mm5, mm0          ;temp d1
    paddw       mm0, mm1          ;d1 + c1
    psubw       mm5, mm1          ;d1 - c1

    ; ---- 4x4 word transpose of mm4, mm0, mm6, mm5 ----
    ; (each pair of digits is row,column; highest word listed first)
    ; 03 02 01 00
    ; 13 12 11 10
    ; 23 22 21 20
    ; 33 32 31 30

    movq        mm3, mm4          ; 03 02 01 00
    punpcklwd   mm4, mm0          ; 11 01 10 00
    punpckhwd   mm3, mm0          ; 13 03 12 02

    movq        mm1, mm6          ; 23 22 21 20
    punpcklwd   mm6, mm5          ; 31 21 30 20
    punpckhwd   mm1, mm5          ; 33 23 32 22

    movq        mm0, mm4          ; 11 01 10 00
    movq        mm2, mm3          ; 13 03 12 02

    punpckldq   mm0, mm6          ; 30 20 10 00 aka ip[0]
    punpckhdq   mm4, mm6          ; 31 21 11 01 aka ip[4]

    punpckldq   mm2, mm1          ; 32 22 12 02 aka ip[8]
    punpckhdq   mm3, mm1          ; 33 23 13 03 aka ip[12]
;~~~~~~~~~~~~~~~~~~~~~
    ; ---- second pass: same butterfly, then (x + 3) >> 3 ----
    movq        mm1, mm0
    movq        mm5, mm4
    paddw       mm1, mm3          ;ip[0] + ip[12] aka a1
    paddw       mm5, mm2          ;ip[4] + ip[8] aka b1

    movq        mm6, mm1          ;temp a1
    paddw       mm1, mm5          ;a1 + b1
    psubw       mm6, mm5          ;a1 - b1
    paddw       mm1, mm7          ;+3 before the shift
    paddw       mm6, mm7
    psraw       mm1, 3            ;arithmetic shift keeps the sign
    psraw       mm6, 3

    psubw       mm0, mm3          ;ip[0] - ip[12] aka d1
    psubw       mm4, mm2          ;ip[4] - ip[8] aka c1

    movq        mm5, mm0          ;temp d1
    paddw       mm0, mm4          ;d1 + c1
    psubw       mm5, mm4          ;d1 - c1
    paddw       mm0, mm7          ;+3 before the shift
    paddw       mm5, mm7
    psraw       mm0, 3
    psraw       mm5, 3
;~~~~~~~~~~~~~~~~~~~~~

    ; ---- scatter mm1 (a1 + b1) and mm0 (d1 + c1), one word each ----
    ; Two words at a time are moved to eax/ecx; the low word is stored,
    ; then shr exposes the high word.  psrlq brings down lanes 2 and 3.
    movd        eax, mm1
    movd        ecx, mm0
    psrlq       mm0, 32
    psrlq       mm1, 32
    mov         word ptr[rdx+32*0], ax
    mov         word ptr[rdx+32*1], cx
    shr         eax, 16
    shr         ecx, 16
    mov         word ptr[rdx+32*4], ax
    mov         word ptr[rdx+32*5], cx
    movd        eax, mm1
    movd        ecx, mm0
    mov         word ptr[rdx+32*8], ax
    mov         word ptr[rdx+32*9], cx
    shr         eax, 16
    shr         ecx, 16
    mov         word ptr[rdx+32*12], ax
    mov         word ptr[rdx+32*13], cx

    ; ---- scatter mm6 (a1 - b1) and mm5 (d1 - c1) the same way ----
    movd        eax, mm6
    movd        ecx, mm5
    psrlq       mm5, 32
    psrlq       mm6, 32
    mov         word ptr[rdx+32*2], ax
    mov         word ptr[rdx+32*3], cx
    shr         eax, 16
    shr         ecx, 16
    mov         word ptr[rdx+32*6], ax
    mov         word ptr[rdx+32*7], cx
    movd        eax, mm6
    movd        ecx, mm5
    mov         word ptr[rdx+32*10], ax
    mov         word ptr[rdx+32*11], cx
    shr         eax, 16
    shr         ecx, 16
    mov         word ptr[rdx+32*14], ax
    mov         word ptr[rdx+32*15], cx

    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret