michael@0: ;
michael@0: ;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
michael@0: ;
michael@0: ;  Use of this source code is governed by a BSD-style license
michael@0: ;  that can be found in the LICENSE file in the root of the source
michael@0: ;  tree. An additional intellectual property rights grant can be found
michael@0: ;  in the file PATENTS.  All contributing project authors may
michael@0: ;  be found in the AUTHORS file in the root of the source tree.
michael@0: ;
michael@0: 
michael@0: 
michael@0: %include "vpx_ports/x86_abi_support.asm"
michael@0: 
michael@0: ; /****************************************************************************
michael@0: ; * Notes:
michael@0: ; *
michael@0: ; * This implementation makes use of 16-bit fixed-point versions of two multiply
michael@0: ; * constants:
michael@0: ; *        1.   sqrt(2) * cos (pi/8)
michael@0: ; *        2.   sqrt(2) * sin (pi/8)
michael@0: ; * Because the first constant is bigger than 1, to maintain the same 16 bit
michael@0: ; * fixed point precision as the second one, we use a trick of
michael@0: ; *        x * a = x + x*(a-1)
michael@0: ; * so
michael@0: ; *        x * sqrt(2) * cos (pi/8) = x + x * (sqrt(2) *cos(pi/8)-1).
michael@0: ; *
michael@0: ; * For the second constant, the 16-bit version is 35468, which is bigger
michael@0: ; * than 32768, so in a signed 16-bit multiply it is treated as a negative
michael@0: ; * number (35468 - 65536) and x has to be added back to the result:
michael@0: ; *        (x * (unsigned)35468) >> 16 = ((x * (signed)35468) >> 16) + x
michael@0: ; *
michael@0: ; **************************************************************************/
michael@0: ; Inverse 4x4 transform of 16 coefficients; the result is added to a 4x4 block of pred
michael@0: ; and written to dest. Two butterfly passes, each followed by a 4x4 word transpose.
michael@0: ;void vp8_short_idct4x4llm_mmx(short *input, unsigned char *pred,
michael@0: ;int pitch, unsigned char *dest,int stride)
michael@0: global sym(vp8_short_idct4x4llm_mmx) PRIVATE
michael@0: sym(vp8_short_idct4x4llm_mmx):
michael@0:     push        rbp
michael@0:     mov         rbp, rsp
michael@0:     SHADOW_ARGS_TO_STACK 5
michael@0:     GET_GOT     rbx
michael@0:     push        rsi
michael@0:     push        rdi
michael@0:     ; end prolog
michael@0: 
michael@0:     mov         rax,    arg(0)              ;input
michael@0:     mov         rsi,    arg(1)              ;pred
michael@0:     ; load the 16 coefficients, four consecutive 16-bit words per register
michael@0:     movq        mm0,    [rax   ]
michael@0:     movq        mm1,    [rax+ 8]
michael@0:     movq        mm2,    [rax+16]
michael@0:     movq        mm3,    [rax+24]
michael@0: 
michael@0: %if 0
michael@0:     pxor        mm7,    mm7                 ; disabled code: would zero the 32 input bytes after loading
michael@0:     movq        [rax],   mm7
michael@0:     movq        [rax+8], mm7
michael@0:     movq        [rax+16],mm7
michael@0:     movq        [rax+24],mm7
michael@0: %endif
michael@0:     movsxd      rax,    dword ptr arg(2)    ;pitch  (byte step between pred rows, see loads from rsi below)
michael@0:     mov         rdx,    arg(3)              ;dest
michael@0:     movsxd      rdi,    dword ptr arg(4)    ;stride (byte step between dest rows, see stores to rdx below)
michael@0: 
michael@0:     ; ---- first pass: butterfly applied lane-wise to mm0..mm3 (inputs 0..3) ----
michael@0:     psubw       mm0,            mm2             ; b1= 0-2
michael@0:     paddw       mm2,            mm2             ; 2 * [2], so that adding b1 below yields 0+2
michael@0: 
michael@0:     movq        mm5,            mm1
michael@0:     paddw       mm2,            mm0             ; a1 =0+2
michael@0: 
michael@0:     pmulhw      mm5,            [GLOBAL(x_s1sqr2)]; signed high product; the paddw below adds x back (see Notes)
michael@0:     paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
michael@0: 
michael@0:     movq        mm7,            mm3             ;
michael@0:     pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)]; x * (a-1); the paddw below adds x (see Notes)
michael@0: 
michael@0:     paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
michael@0:     psubw       mm7,            mm5             ; c1: mm7 = (ip3 cos term) - (ip1 sin term)
michael@0: 
michael@0:     movq        mm5,            mm1
michael@0:     movq        mm4,            mm3             ; keep ip3 for the add-back after pmulhw
michael@0: 
michael@0:     pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
michael@0:     paddw       mm5,            mm1             ; ip1 * cos(pi/8) * sqrt(2)
michael@0: 
michael@0:     pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
michael@0:     paddw       mm3,            mm4             ; ip3 * sin(pi/8) * sqrt(2)
michael@0: 
michael@0:     paddw       mm3,            mm5             ; d1 = (ip3 sin term) + (ip1 cos term)
michael@0:     movq        mm6,            mm2             ; a1
michael@0: 
michael@0:     movq        mm4,            mm0             ; b1
michael@0:     paddw       mm2,            mm3             ;0 = a1 + d1
michael@0: 
michael@0:     paddw       mm4,            mm7             ;2 = b1 + mm7 (used as "23 22 21 20" by the transpose below)
michael@0:     psubw       mm0,            mm7             ;1 = b1 - mm7 (interleaved with 0 by the transpose below)
michael@0: 
michael@0:     psubw       mm6,            mm3             ;3 = a1 - d1
michael@0:     ; ---- 4x4 word transpose; the word lists below are written high..low ----
michael@0:     movq        mm1,            mm2             ; 03 02 01 00
michael@0:     movq        mm3,            mm4             ; 23 22 21 20
michael@0: 
michael@0:     punpcklwd   mm1,            mm0             ; 11 01 10 00
michael@0:     punpckhwd   mm2,            mm0             ; 13 03 12 02
michael@0: 
michael@0:     punpcklwd   mm3,            mm6             ; 31 21 30 20
michael@0:     punpckhwd   mm4,            mm6             ; 33 23 32 22
michael@0: 
michael@0:     movq        mm0,            mm1             ; 11 01 10 00
michael@0:     movq        mm5,            mm2             ; 13 03 12 02
michael@0: 
michael@0:     punpckldq   mm0,            mm3             ; 30 20 10 00
michael@0:     punpckhdq   mm1,            mm3             ; 31 21 11 01
michael@0: 
michael@0:     punpckldq   mm2,            mm4             ; 32 22 12 02
michael@0:     punpckhdq   mm5,            mm4             ; 33 23 13 03
michael@0: 
michael@0:     movq        mm3,            mm5             ; 33 23 13 03
michael@0:     ; ---- second pass: same butterfly on the transposed data, then (x + 4) >> 3 ----
michael@0:     psubw       mm0,            mm2             ; b1= 0-2
michael@0:     paddw       mm2,            mm2             ;
michael@0: 
michael@0:     movq        mm5,            mm1
michael@0:     paddw       mm2,            mm0             ; a1 =0+2
michael@0: 
michael@0:     pmulhw      mm5,            [GLOBAL(x_s1sqr2)];
michael@0:     paddw       mm5,            mm1             ; ip1 * sin(pi/8) * sqrt(2)
michael@0: 
michael@0:     movq        mm7,            mm3             ;
michael@0:     pmulhw      mm7,            [GLOBAL(x_c1sqr2less1)];
michael@0: 
michael@0:     paddw       mm7,            mm3             ; ip3 * cos(pi/8) * sqrt(2)
michael@0:     psubw       mm7,            mm5             ; c1: mm7 = (ip3 cos term) - (ip1 sin term)
michael@0: 
michael@0:     movq        mm5,            mm1
michael@0:     movq        mm4,            mm3
michael@0: 
michael@0:     pmulhw      mm5,            [GLOBAL(x_c1sqr2less1)]
michael@0:     paddw       mm5,            mm1
michael@0: 
michael@0:     pmulhw      mm3,            [GLOBAL(x_s1sqr2)]
michael@0:     paddw       mm3,            mm4
michael@0: 
michael@0:     paddw       mm3,            mm5             ; d1
michael@0:     paddw       mm0,            [GLOBAL(fours)] ; b1 += 4: rounding term, added before b1 is copied so both b1 +/- mm7 get it
michael@0: 
michael@0:     paddw       mm2,            [GLOBAL(fours)] ; a1 += 4: rounding term, added before a1 is copied so both a1 +/- d1 get it
michael@0:     movq        mm6,            mm2             ; a1
michael@0: 
michael@0:     movq        mm4,            mm0             ; b1
michael@0:     paddw       mm2,            mm3             ;0 = a1 + d1
michael@0: 
michael@0:     paddw       mm4,            mm7             ;2 = b1 + mm7
michael@0:     psubw       mm0,            mm7             ;1 = b1 - mm7
michael@0: 
michael@0:     psubw       mm6,            mm3             ;3 = a1 - d1
michael@0:     psraw       mm2,            3               ; arithmetic >> 3 on each word
michael@0: 
michael@0:     psraw       mm0,            3
michael@0:     psraw       mm4,            3
michael@0: 
michael@0:     psraw       mm6,            3
michael@0:     ; ---- transpose back; afterwards mm0, mm1, mm2, mm5 are stored as output rows 0..3 ----
michael@0:     movq        mm1,            mm2             ; 03 02 01 00
michael@0:     movq        mm3,            mm4             ; 23 22 21 20
michael@0: 
michael@0:     punpcklwd   mm1,            mm0             ; 11 01 10 00
michael@0:     punpckhwd   mm2,            mm0             ; 13 03 12 02
michael@0: 
michael@0:     punpcklwd   mm3,            mm6             ; 31 21 30 20
michael@0:     punpckhwd   mm4,            mm6             ; 33 23 32 22
michael@0: 
michael@0:     movq        mm0,            mm1             ; 11 01 10 00
michael@0:     movq        mm5,            mm2             ; 13 03 12 02
michael@0: 
michael@0:     punpckldq   mm0,            mm3             ; 30 20 10 00
michael@0:     punpckhdq   mm1,            mm3             ; 31 21 11 01
michael@0: 
michael@0:     punpckldq   mm2,            mm4             ; 32 22 12 02
michael@0:     punpckhdq   mm5,            mm4             ; 33 23 13 03
michael@0: 
michael@0:     pxor        mm7,            mm7             ; mm7 = 0: zero-extends pred bytes and fills the unused half of each pack
michael@0:     ; row 0: 4 pred bytes -> words, add with signed saturation, pack with unsigned saturation, store 4 bytes
michael@0:     movd        mm4,            [rsi]
michael@0:     punpcklbw   mm4,            mm7
michael@0:     paddsw      mm0,            mm4
michael@0:     packuswb    mm0,            mm7
michael@0:     movd        [rdx],          mm0
michael@0:     ; row 1
michael@0:     movd        mm4,            [rsi+rax]
michael@0:     punpcklbw   mm4,            mm7
michael@0:     paddsw      mm1,            mm4
michael@0:     packuswb    mm1,            mm7
michael@0:     movd        [rdx+rdi],      mm1
michael@0:     ; row 2
michael@0:     movd        mm4,            [rsi+2*rax]
michael@0:     punpcklbw   mm4,            mm7
michael@0:     paddsw      mm2,            mm4
michael@0:     packuswb    mm2,            mm7
michael@0:     movd        [rdx+rdi*2],    mm2
michael@0:     ; step both pointers down one row so that base + 2*step addresses row 3
michael@0:     add         rdx,            rdi
michael@0:     add         rsi,            rax
michael@0:     ; row 3
michael@0:     movd        mm4,            [rsi+2*rax]
michael@0:     punpcklbw   mm4,            mm7
michael@0:     paddsw      mm5,            mm4
michael@0:     packuswb    mm5,            mm7
michael@0:     movd        [rdx+rdi*2],    mm5
michael@0: 
michael@0:     ; begin epilog
michael@0:     pop rdi
michael@0:     pop rsi
michael@0:     RESTORE_GOT
michael@0:     UNSHADOW_ARGS
michael@0:     pop         rbp
michael@0:     ret
michael@0: ; Adds the rounded DC term, (input_dc + 4) >> 3, to a 4x4 block read from pred_ptr and writes the clamped bytes to dst_ptr.
michael@0: ;void vp8_dc_only_idct_add_mmx(
michael@0: ;short input_dc,
michael@0: ;unsigned char *pred_ptr,
michael@0: ;int pred_stride,
michael@0: ;unsigned char *dst_ptr,
michael@0: ;int stride)
michael@0: global sym(vp8_dc_only_idct_add_mmx) PRIVATE
michael@0: sym(vp8_dc_only_idct_add_mmx):
michael@0:     push        rbp
michael@0:     mov         rbp, rsp
michael@0:     SHADOW_ARGS_TO_STACK 5
michael@0:     GET_GOT     rbx
michael@0:     ; end prolog
michael@0: 
michael@0:         movd        mm5,            arg(0) ;input_dc
michael@0:         mov         rax,            arg(1) ;pred_ptr
michael@0:         movsxd      rdx,            dword ptr arg(2) ;pred_stride
michael@0: 
michael@0:         pxor        mm0,            mm0              ; mm0 = 0, used for byte<->word conversion
michael@0: 
michael@0:         paddw       mm5,            [GLOBAL(fours)]  ; + 4 (rounding term for the shift below)
michael@0:         lea         rcx,            [rdx + rdx*2]    ; rcx = 3 * pred_stride (offset of row 3)
michael@0: 
michael@0:         psraw       mm5,            3                ; low word = (input_dc + 4) >> 3, arithmetic shift
michael@0: 
michael@0:         punpcklwd   mm5,            mm5              ; duplicate the low word into words 0 and 1
michael@0: 
michael@0:         punpckldq   mm5,            mm5              ; duplicate the low dword: all four words now hold the DC term
michael@0:         ; load the four predictor rows, 4 bytes each
michael@0:         movd        mm1,            [rax]
michael@0:         movd        mm2,            [rax+rdx]
michael@0:         movd        mm3,            [rax+2*rdx]
michael@0:         movd        mm4,            [rax+rcx]
michael@0: 
michael@0:         mov         rax,            arg(3) ;dst_ptr
michael@0:         movsxd      rdx,            dword ptr arg(4) ;stride (destination)
michael@0: 
michael@0:         punpcklbw   mm1,            mm0              ; row 0: bytes -> words (zero-extended)
michael@0:         paddsw      mm1,            mm5              ; add DC term with signed saturation
michael@0:         packuswb    mm1,            mm0              ; pack words back to bytes with unsigned saturation
michael@0:         lea         rcx,            [rdx + rdx*2]    ; rcx = 3 * stride (offset of destination row 3)
michael@0: 
michael@0:         punpcklbw   mm2,            mm0              ; row 1
michael@0:         paddsw      mm2,            mm5
michael@0:         packuswb    mm2,            mm0              ; pack words back to bytes with unsigned saturation
michael@0: 
michael@0:         punpcklbw   mm3,            mm0              ; row 2
michael@0:         paddsw      mm3,            mm5
michael@0:         packuswb    mm3,            mm0              ; pack words back to bytes with unsigned saturation
michael@0: 
michael@0:         punpcklbw   mm4,            mm0              ; row 3
michael@0:         paddsw      mm4,            mm5
michael@0:         packuswb    mm4,            mm0              ; pack words back to bytes with unsigned saturation
michael@0:         ; store the four result rows, 4 bytes each; all pred loads above happen before any store
michael@0:         movd        [rax],          mm1
michael@0:         movd        [rax+rdx],      mm2
michael@0:         movd        [rax+2*rdx],    mm3
michael@0:         movd        [rax+rcx],      mm4
michael@0: 
michael@0:     ; begin epilog
michael@0:     RESTORE_GOT
michael@0:     UNSHADOW_ARGS
michael@0:     pop         rbp
michael@0:     ret
michael@0: 
michael@0: SECTION_RODATA
michael@0: align 16
michael@0: x_s1sqr2:                       ; sqrt(2) * sin(pi/8) in 16-bit fixed point, one copy per word lane
michael@0:     times 4 dw 0x8A8C           ; 35468; negative as a signed word, so users add x back after pmulhw (see Notes)
michael@0: align 16
michael@0: x_c1sqr2less1:                  ; sqrt(2) * cos(pi/8) - 1 in 16-bit fixed point, one copy per word lane
michael@0:     times 4 dw 0x4E7B           ; 20091; users compute x + pmulhw(x, this) to multiply by sqrt(2) * cos(pi/8)
michael@0: align 16
michael@0: fours:                          ; rounding term added before the arithmetic shift right by 3
michael@0:     times 4 dw 0x0004