;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; void vp8_short_inv_walsh4x4_mmx(short *input, short *output)
;
; Two-pass 4x4 add/subtract butterfly on 16 signed 16-bit values.
;   Pass 1 : combines the four 8-byte input rows element-wise.
;   Transpose of the 4x4 intermediate.
;   Pass 2 : same butterfly again, then (x + 3) >> 3 (arithmetic shift)
;            on every result.
;
; In:    arg(0) = input  - 32 bytes are read (rows at +0, +8, +16, +24)
;        arg(1) = output - result k (raster order, k = 4*row + col) is
;                          written as ONE word at byte offset 32*k;
;                          nothing else in the output buffer is touched.
;                          NOTE(review): the 32-byte stride presumably
;                          targets the first coefficient of consecutive
;                          16-short blocks - confirm against the caller.
; Uses:  rax, rcx, rdx, mm0-mm7, flags
; Note:  no emms is executed here; MMX state is still live on return.
;        sym(), arg(), PRIVATE, SHADOW_ARGS_TO_STACK and UNSHADOW_ARGS
;        are not defined in this file (expected from the include above).
;-----------------------------------------------------------------------
global sym(vp8_short_inv_walsh4x4_mmx) PRIVATE
sym(vp8_short_inv_walsh4x4_mmx):
    push        rbp
    mov         rbp, rsp
    SHADOW_ARGS_TO_STACK 2
    ; end prolog

    mov         rdx, arg(0)             ;rdx = input
    mov         rax, 30003h             ;two words of 3 (rounding term)

    movq        mm0, [rdx + 0]          ;ip[0]
    movq        mm1, [rdx + 8]          ;ip[4]
    movq        mm7, rax                ;low dword of mm7 = 0003 0003

    movq        mm2, [rdx + 16]         ;ip[8]
    movq        mm3, [rdx + 24]         ;ip[12]
    punpcklwd   mm7, mm7                ;0003000300030003h
    mov         rdx, arg(1)             ;rdx = output (input fully loaded)

    ;~~~~~~~~~~~~~~~~~~~~~ pass 1: butterfly across the four rows
    movq        mm4, mm0
    movq        mm5, mm1

    paddw       mm4, mm3                ;ip[0] + ip[12] aka a1
    paddw       mm5, mm2                ;ip[4] + ip[8]  aka b1

    movq        mm6, mm4                ;temp a1
    paddw       mm4, mm5                ;a1 + b1  -> intermediate row 0
    psubw       mm6, mm5                ;a1 - b1  -> intermediate row 2

    psubw       mm0, mm3                ;ip[0] - ip[12] aka d1
    psubw       mm1, mm2                ;ip[4] - ip[8]  aka c1

    movq        mm5, mm0                ;temp d1
    paddw       mm0, mm1                ;d1 + c1  -> intermediate row 1
    psubw       mm5, mm1                ;d1 - c1  -> intermediate row 3

    ;~~~~~~~~~~~~~~~~~~~~~ transpose (digits below are row,col; high word first)
    ; mm4: 03 02 01 00
    ; mm0: 13 12 11 10
    ; mm6: 23 22 21 20
    ; mm5: 33 32 31 30

    movq        mm3, mm4                ; 03 02 01 00
    punpcklwd   mm4, mm0                ; 11 01 10 00
    punpckhwd   mm3, mm0                ; 13 03 12 02

    movq        mm1, mm6                ; 23 22 21 20
    punpcklwd   mm6, mm5                ; 31 21 30 20
    punpckhwd   mm1, mm5                ; 33 23 32 22

    movq        mm0, mm4                ; 11 01 10 00
    movq        mm2, mm3                ; 13 03 12 02

    punpckldq   mm0, mm6                ; 30 20 10 00 aka ip[0]
    punpckhdq   mm4, mm6                ; 31 21 11 01 aka ip[4]

    punpckldq   mm2, mm1                ; 32 22 12 02 aka ip[8]
    punpckhdq   mm3, mm1                ; 33 23 13 03 aka ip[12]

    ;~~~~~~~~~~~~~~~~~~~~~ pass 2: butterfly again, then (x + 3) >> 3
    movq        mm1, mm0
    movq        mm5, mm4
    paddw       mm1, mm3                ;ip[0] + ip[12] aka a1
    paddw       mm5, mm2                ;ip[4] + ip[8]  aka b1

    movq        mm6, mm1                ;temp a1
    paddw       mm1, mm5                ;a1 + b1
    psubw       mm6, mm5                ;a1 - b1
    paddw       mm1, mm7                ;+3
    paddw       mm6, mm7                ;+3
    psraw       mm1, 3                  ;mm1 word i = result (row i, col 0)
    psraw       mm6, 3                  ;mm6 word i = result (row i, col 2)

    psubw       mm0, mm3                ;ip[0] - ip[12] aka d1
    psubw       mm4, mm2                ;ip[4] - ip[8]  aka c1

    movq        mm5, mm0                ;temp d1
    paddw       mm0, mm4                ;d1 + c1
    psubw       mm5, mm4                ;d1 - c1
    paddw       mm0, mm7                ;+3
    paddw       mm5, mm7                ;+3
    psraw       mm0, 3                  ;mm0 word i = result (row i, col 1)
    psraw       mm5, 3                  ;mm5 word i = result (row i, col 3)
    ;~~~~~~~~~~~~~~~~~~~~~

    ; Scatter the 16 results: each one is moved through eax/ecx and
    ; stored as a single word at output byte offset 32 * (4*row + col).

    ; columns 0 (mm1 -> ax) and 1 (mm0 -> cx)
    movd        eax, mm1                ;rows 0,1 of col 0
    movd        ecx, mm0                ;rows 0,1 of col 1
    psrlq       mm0, 32                 ;bring rows 2,3 down
    psrlq       mm1, 32
    mov         word ptr[rdx+32*0], ax
    mov         word ptr[rdx+32*1], cx
    shr         eax, 16                 ;row 1
    shr         ecx, 16
    mov         word ptr[rdx+32*4], ax
    mov         word ptr[rdx+32*5], cx
    movd        eax, mm1                ;rows 2,3 of col 0
    movd        ecx, mm0                ;rows 2,3 of col 1
    mov         word ptr[rdx+32*8], ax
    mov         word ptr[rdx+32*9], cx
    shr         eax, 16                 ;row 3
    shr         ecx, 16
    mov         word ptr[rdx+32*12], ax
    mov         word ptr[rdx+32*13], cx

    ; columns 2 (mm6 -> ax) and 3 (mm5 -> cx)
    movd        eax, mm6                ;rows 0,1 of col 2
    movd        ecx, mm5                ;rows 0,1 of col 3
    psrlq       mm5, 32                 ;bring rows 2,3 down
    psrlq       mm6, 32
    mov         word ptr[rdx+32*2], ax
    mov         word ptr[rdx+32*3], cx
    shr         eax, 16                 ;row 1
    shr         ecx, 16
    mov         word ptr[rdx+32*6], ax
    mov         word ptr[rdx+32*7], cx
    movd        eax, mm6                ;rows 2,3 of col 2
    movd        ecx, mm5                ;rows 2,3 of col 3
    mov         word ptr[rdx+32*10], ax
    mov         word ptr[rdx+32*11], cx
    shr         eax, 16                 ;row 3
    shr         ecx, 16
    mov         word ptr[rdx+32*14], ax
    mov         word ptr[rdx+32*15], cx

    ; begin epilog
    UNSHADOW_ARGS
    pop         rbp
    ret