;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------
; STACK_FRAME_CREATE
;
; Shared prolog for both transforms in this file.  It binds the symbolic
; names `input`, `output` and `pitch` to the registers that hold the three
; C arguments (short *input, short *output, int pitch) for the ABI being
; assembled:
;
;   32-bit      : input = rsi, output = rdi, pitch = rax (loaded from the
;                 stack through arg(n); rbp/rsi/rdi are saved here and
;                 restored by STACK_FRAME_DESTROY)
;   Win64       : input = rcx, output = rdx, pitch = r8  (SAVE_XMM 7, u is
;                 invoked because the 8x4 transform writes xmm6)
;   other 64-bit: input = rdi, output = rsi, pitch = rdx
;
; `pitch` is a C `int` that is used as a full-width index in effective
; addresses, so every path widens it with movsxd before it is used.
;-----------------------------------------------------------------------
%macro STACK_FRAME_CREATE 0
%if ABI_IS_32BIT
  %define       input       rsi
  %define       output      rdi
  %define       pitch       rax
    push        rbp
    mov         rbp, rsp
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

    mov         rsi, arg(0)
    mov         rdi, arg(1)

    movsxd      rax, dword ptr arg(2)
%else
  %if LIBVPX_YASM_WIN64
    %define     input       rcx
    %define     output      rdx
    %define     pitch       r8
    SAVE_XMM 7, u
    movsxd      pitch, r8d              ; upper half of an int arg register is undefined
  %else
    %define     input       rdi
    %define     output      rsi
    %define     pitch       rdx
    movsxd      pitch, edx              ; upper half of an int arg register is undefined
  %endif
%endif
%endmacro

;-----------------------------------------------------------------------
; STACK_FRAME_DESTROY
;
; Shared epilog: clears the symbolic argument names, undoes whatever
; STACK_FRAME_CREATE pushed/saved for the active ABI, and returns.
;-----------------------------------------------------------------------
%macro STACK_FRAME_DESTROY 0
  %define     input
  %define     output
  %define     pitch

%if ABI_IS_32BIT
    pop         rdi
    pop         rsi
    RESTORE_GOT
    pop         rbp
%else
  %if LIBVPX_YASM_WIN64
    RESTORE_XMM
  %endif
%endif
    ret
%endmacro

;-----------------------------------------------------------------------
;void vp8_short_fdct4x4_sse2(short *input, short *output, int pitch)
;
; Forward 4x4 transform of one block of 16-bit samples.
;   input  : four rows of four shorts; consecutive rows are `pitch` BYTES
;            apart (rows are fetched with 8-byte movq loads).
;   output : 16 shorts (32 bytes), written with movdqa, so it must be
;            16-byte aligned.
; Uses xmm0-xmm5 and advances the `input` register by 2*pitch.
;-----------------------------------------------------------------------
global sym(vp8_short_fdct4x4_sse2) PRIVATE
sym(vp8_short_fdct4x4_sse2):

    STACK_FRAME_CREATE

    ; load the four input rows
    movq        xmm0, MMWORD PTR[input        ] ;03 02 01 00
    movq        xmm2, MMWORD PTR[input+  pitch] ;13 12 11 10
    lea         input,          [input+2*pitch]
    movq        xmm1, MMWORD PTR[input        ] ;23 22 21 20
    movq        xmm3, MMWORD PTR[input+  pitch] ;33 32 31 30

    punpcklqdq  xmm0, xmm2                      ;13 12 11 10 03 02 01 00
    punpcklqdq  xmm1, xmm3                      ;33 32 31 30 23 22 21 20

    ; pair columns 0/1 of every row against columns 3/2 (swapped) so that
    ; a single paddw/psubw yields a1,b1 / d1,c1 for all four rows
    movdqa      xmm2, xmm0
    punpckldq   xmm0, xmm1                      ;23 22 03 02 21 20 01 00
    punpckhdq   xmm2, xmm1                      ;33 32 13 12 31 30 11 10
    movdqa      xmm1, xmm0
    punpckldq   xmm0, xmm2                      ;31 30 21 20 11 10 01 00
    pshufhw     xmm1, xmm1, 0b1h                ;22 23 02 03 xx xx xx xx
    pshufhw     xmm2, xmm2, 0b1h                ;32 33 12 13 xx xx xx xx

    punpckhdq   xmm1, xmm2                      ;32 33 22 23 12 13 02 03
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm1                      ;b1 a1 b1 a1 b1 a1 b1 a1
    psubw       xmm3, xmm1                      ;c1 d1 c1 d1 c1 d1 c1 d1
    psllw       xmm0, 3                         ;b1 <<= 3 a1 <<= 3
    psllw       xmm3, 3                         ;c1 <<= 3 d1 <<= 3

    ; first pass butterflies
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]        ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]        ;a1 - b1
    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]       ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)]    ;d1*2217 - c1*5352

    paddd       xmm3, XMMWORD PTR[GLOBAL(_14500)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_7500)]
    psrad       xmm3, 12            ;(c1 * 2217 + d1 * 5352 +  14500)>>12
    psrad       xmm4, 12            ;(d1 * 2217 - c1 * 5352 +   7500)>>12

    packssdw    xmm0, xmm1                      ;op[2] op[0]
    packssdw    xmm3, xmm4                      ;op[3] op[1]
    ; 23 22 21 20 03 02 01 00
    ;
    ; 33 32 31 30 13 12 11 10
    ;
    ; regroup the first-pass results for the second pass
    movdqa      xmm2, xmm0
    punpcklqdq  xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhqdq  xmm2, xmm3                      ;23 22 21 20 33 32 31 30

    movdqa      xmm3, xmm0
    punpcklwd   xmm0, xmm2                      ;32 30 22 20 12 10 02 00
    punpckhwd   xmm3, xmm2                      ;33 31 23 21 13 11 03 01
    movdqa      xmm2, xmm0
    punpcklwd   xmm0, xmm3                      ;13 12 11 10 03 02 01 00
    punpckhwd   xmm2, xmm3                      ;33 32 31 30 23 22 21 20

    ; second pass
    movdqa      xmm5, XMMWORD PTR[GLOBAL(_7)]   ;rounding term for op[0]/op[8]
    pshufd      xmm2, xmm2, 04eh                ;swap the two qwords
    movdqa      xmm3, xmm0
    paddw       xmm0, xmm2                      ;b1 b1 b1 b1 a1 a1 a1 a1
    psubw       xmm3, xmm2                      ;c1 c1 c1 c1 d1 d1 d1 d1

    pshufd      xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 b1 a1 a1
    movdqa      xmm2, xmm3                      ;save d1 for compare
    pshufd      xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 c1 d1 d1
    pshuflw     xmm0, xmm0, 0d8h                ;b1 b1 a1 a1 b1 a1 b1 a1
    pshuflw     xmm3, xmm3, 0d8h                ;c1 c1 d1 d1 c1 d1 c1 d1
    pshufhw     xmm0, xmm0, 0d8h                ;b1 a1 b1 a1 b1 a1 b1 a1
    pshufhw     xmm3, xmm3, 0d8h                ;c1 d1 c1 d1 c1 d1 c1 d1
    movdqa      xmm1, xmm0
    pmaddwd     xmm0, XMMWORD PTR[GLOBAL(_mult_add)]        ;a1 + b1
    pmaddwd     xmm1, XMMWORD PTR[GLOBAL(_mult_sub)]        ;a1 - b1

    pxor        xmm4, xmm4                      ;zero out for compare
    paddd       xmm0, xmm5
    paddd       xmm1, xmm5
    pcmpeqw     xmm2, xmm4                      ;0xffff in every word that is 0
    psrad       xmm0, 4                         ;(a1 + b1 + 7)>>4
    psrad       xmm1, 4                         ;(a1 - b1 + 7)>>4
    pandn       xmm2, XMMWORD PTR[GLOBAL(_cmp_mask)]        ;clear upper,
                                                            ;and keep bit 0 of lower
                                                            ;=> low words = (d1 != 0)

    movdqa      xmm4, xmm3
    pmaddwd     xmm3, XMMWORD PTR[GLOBAL(_5352_2217)]       ;c1*2217 + d1*5352
    pmaddwd     xmm4, XMMWORD PTR[GLOBAL(_2217_neg5352)]    ;d1*2217 - c1*5352
    paddd       xmm3, XMMWORD PTR[GLOBAL(_12000)]
    paddd       xmm4, XMMWORD PTR[GLOBAL(_51000)]
    packssdw    xmm0, xmm1                      ;op[8] op[0]
    psrad       xmm3, 16            ;(c1 * 2217 + d1 * 5352 +  12000)>>16
    psrad       xmm4, 16            ;(d1 * 2217 - c1 * 5352 +  51000)>>16

    packssdw    xmm3, xmm4                      ;op[12] op[4]
    movdqa      xmm1, xmm0
    paddw       xmm3, xmm2                      ;op[4] += (d1!=0)
    punpcklqdq  xmm0, xmm3                      ;op[4] op[0]
    punpckhqdq  xmm1, xmm3                      ;op[12] op[8]

    movdqa      XMMWORD PTR[output +  0], xmm0
    movdqa      XMMWORD PTR[output + 16], xmm1

    STACK_FRAME_DESTROY

;-----------------------------------------------------------------------
;void vp8_short_fdct8x4_sse2(short *input, short *output, int pitch)
;
; Forward transform of two horizontally adjacent 4x4 blocks in one go.
;   input  : four rows of eight shorts; consecutive rows are `pitch` BYTES
;            apart.  Rows are fetched with movdqa, so `input` and `pitch`
;            must keep every row 16-byte aligned.
;   output : 32 shorts (64 bytes), written with movdqa, so it must be
;            16-byte aligned.
; Uses xmm0-xmm6 and advances the `input` register by 2*pitch.
;-----------------------------------------------------------------------
global sym(vp8_short_fdct8x4_sse2) PRIVATE
sym(vp8_short_fdct8x4_sse2):

    STACK_FRAME_CREATE

        ; read the input data
        movdqa      xmm0,       [input        ]
        movdqa      xmm2,       [input+  pitch]
        lea         input,      [input+2*pitch]
        movdqa      xmm4,       [input        ]
        movdqa      xmm3,       [input+  pitch]

        ; transpose for the first stage
        movdqa      xmm1,       xmm0        ; 00 01 02 03 04 05 06 07
        movdqa      xmm5,       xmm4        ; 20 21 22 23 24 25 26 27

        punpcklwd   xmm0,       xmm2        ; 00 10 01 11 02 12 03 13
        punpckhwd   xmm1,       xmm2        ; 04 14 05 15 06 16 07 17

        punpcklwd   xmm4,       xmm3        ; 20 30 21 31 22 32 23 33
        punpckhwd   xmm5,       xmm3        ; 24 34 25 35 26 36 27 37

        movdqa      xmm2,       xmm0        ; 00 10 01 11 02 12 03 13
        punpckldq   xmm0,       xmm4        ; 00 10 20 30 01 11 21 31

        punpckhdq   xmm2,       xmm4        ; 02 12 22 32 03 13 23 33

        movdqa      xmm4,       xmm1        ; 04 14 05 15 06 16 07 17
        punpckldq   xmm4,       xmm5        ; 04 14 24 34 05 15 25 35

        punpckhdq   xmm1,       xmm5        ; 06 16 26 36 07 17 27 37
        movdqa      xmm3,       xmm2        ; 02 12 22 32 03 13 23 33

        punpckhqdq  xmm3,       xmm1        ; 03 13 23 33 07 17 27 37
        punpcklqdq  xmm2,       xmm1        ; 02 12 22 32 06 16 26 36

        movdqa      xmm1,       xmm0        ; 00 10 20 30 01 11 21 31
        punpcklqdq  xmm0,       xmm4        ; 00 10 20 30 04 14 24 34

        punpckhqdq  xmm1,       xmm4        ; 01 11 21 31 05 15 25 35

        ; xmm0 0
        ; xmm1 1
        ; xmm2 2
        ; xmm3 3

        ; first stage
        movdqa      xmm5,       xmm0
        movdqa      xmm4,       xmm1

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm2        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        psllw       xmm5,        3
        psllw       xmm4,        3

        psllw       xmm0,        3
        psllw       xmm1,        3

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; op[0] = a1 + b1
        psubw       xmm2,       xmm1        ; op[2] = a1 - b1

        ; output 1 and 3
        ; interleave c1, d1
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; c1 d1
        punpckhwd   xmm5,       xmm4        ; c1 d1

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_14500)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_7500)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_7500)]

        psrad       xmm1,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       xmm4,       12          ; (c1 * 2217 + d1 * 5352 +  14500)>>12
        psrad       xmm3,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12
        psrad       xmm5,       12          ; (d1 * 2217 - c1 * 5352 +   7500)>>12

        packssdw    xmm1,       xmm4        ; op[1]
        packssdw    xmm3,       xmm5        ; op[3]

        ; done with vertical
        ; transpose for the second stage
        movdqa      xmm4,       xmm0        ; 00 10 20 30 04 14 24 34
        movdqa      xmm5,       xmm2        ; 02 12 22 32 06 16 26 36

        punpcklwd   xmm0,       xmm1        ; 00 01 10 11 20 21 30 31
        punpckhwd   xmm4,       xmm1        ; 04 05 14 15 24 25 34 35

        punpcklwd   xmm2,       xmm3        ; 02 03 12 13 22 23 32 33
        punpckhwd   xmm5,       xmm3        ; 06 07 16 17 26 27 36 37

        movdqa      xmm1,       xmm0        ; 00 01 10 11 20 21 30 31
        punpckldq   xmm0,       xmm2        ; 00 01 02 03 10 11 12 13

        punpckhdq   xmm1,       xmm2        ; 20 21 22 23 30 31 32 33

        movdqa      xmm2,       xmm4        ; 04 05 14 15 24 25 34 35
        punpckldq   xmm2,       xmm5        ; 04 05 06 07 14 15 16 17

        punpckhdq   xmm4,       xmm5        ; 24 25 26 27 34 35 36 37
        movdqa      xmm3,       xmm1        ; 20 21 22 23 30 31 32 33

        punpckhqdq  xmm3,       xmm4        ; 30 31 32 33 34 35 36 37
        punpcklqdq  xmm1,       xmm4        ; 20 21 22 23 24 25 26 27

        movdqa      xmm4,       xmm0        ; 00 01 02 03 10 11 12 13
        punpcklqdq  xmm0,       xmm2        ; 00 01 02 03 04 05 06 07

        punpckhqdq  xmm4,       xmm2        ; 10 11 12 13 14 15 16 17

        ; xmm0 0
        ; xmm4 1
        ; xmm1 2
        ; xmm3 3

        ; second stage
        movdqa      xmm5,       xmm0
        movdqa      xmm2,       xmm1

        paddw       xmm0,       xmm3        ; a1 = 0 + 3
        paddw       xmm1,       xmm4        ; b1 = 1 + 2

        psubw       xmm4,       xmm2        ; c1 = 1 - 2
        psubw       xmm5,       xmm3        ; d1 = 0 - 3

        pxor        xmm6,       xmm6        ; zero out for compare

        pcmpeqw     xmm6,       xmm5        ; 0xffff where d1 == 0

        pandn       xmm6,       XMMWORD PTR[GLOBAL(_cmp_mask8x4)]   ; keep bit 0 only
                                                                    ; => (d1 != 0)

        ; output 0 and 2
        movdqa      xmm2,       xmm0        ; a1

        paddw       xmm0,       xmm1        ; a1 + b1
        psubw       xmm2,       xmm1        ; a1 - b1

        paddw       xmm0,       XMMWORD PTR[GLOBAL(_7w)]
        paddw       xmm2,       XMMWORD PTR[GLOBAL(_7w)]

        psraw       xmm0,       4           ; op[0] = (a1 + b1 + 7)>>4
        psraw       xmm2,       4           ; op[8] = (a1 - b1 + 7)>>4

        ; output 1 and 3
        ; interleave c1, d1
        movdqa      xmm1,       xmm5        ; d1
        punpcklwd   xmm1,       xmm4        ; c1 d1
        punpckhwd   xmm5,       xmm4        ; c1 d1

        movdqa      xmm3,       xmm1
        movdqa      xmm4,       xmm5

        pmaddwd     xmm1,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352
        pmaddwd     xmm4,       XMMWORD PTR[GLOBAL(_5352_2217)]     ; c1*2217 + d1*5352

        pmaddwd     xmm3,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352
        pmaddwd     xmm5,       XMMWORD PTR[GLOBAL(_2217_neg5352)]  ; d1*2217 - c1*5352

        paddd       xmm1,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm4,       XMMWORD PTR[GLOBAL(_12000)]
        paddd       xmm3,       XMMWORD PTR[GLOBAL(_51000)]
        paddd       xmm5,       XMMWORD PTR[GLOBAL(_51000)]

        psrad       xmm1,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       xmm4,       16          ; (c1 * 2217 + d1 * 5352 +  12000)>>16
        psrad       xmm3,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16
        psrad       xmm5,       16          ; (d1 * 2217 - c1 * 5352 +  51000)>>16

        packssdw    xmm1,       xmm4        ; op[4]
        packssdw    xmm3,       xmm5        ; op[12]

        paddw       xmm1,       xmm6        ; op[4] += (d1!=0)

        ; split the 8-wide results back into the two 4x4 output blocks
        movdqa      xmm4,       xmm0
        movdqa      xmm5,       xmm2

        punpcklqdq  xmm0,       xmm1
        punpckhqdq  xmm4,       xmm1

        punpcklqdq  xmm2,       xmm3
        punpckhqdq  xmm5,       xmm3

        movdqa      XMMWORD PTR[output + 0 ],  xmm0
        movdqa      XMMWORD PTR[output + 16],  xmm2
        movdqa      XMMWORD PTR[output + 32],  xmm4
        movdqa      XMMWORD PTR[output + 48],  xmm5

    STACK_FRAME_DESTROY

; Constant tables.  Every table is 16-byte aligned because it is used as a
; 128-bit memory operand.
SECTION_RODATA
; pmaddwd weights: low word * 5352 + high word * 2217
align 16
_5352_2217:
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
    dw 5352
    dw 2217
; pmaddwd weights: low word * 2217 - high word * 5352
align 16
_2217_neg5352:
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
    dw 2217
    dw -5352
; pmaddwd weights: low word + high word
align 16
_mult_add:
    times 8 dw 1
; 4x4: keeps bit 0 of the four low words only
align 16
_cmp_mask:
    times 4 dw 1
    times 4 dw 0
; 8x4: keeps bit 0 of all eight words
align 16
_cmp_mask8x4:
    times 8 dw 1
; pmaddwd weights: low word - high word
align 16
_mult_sub:
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
    dw 1
    dw -1
; rounding terms
align 16
_7:
    times 4 dd 7
align 16
_7w:
    times 8 dw 7
align 16
_14500:
    times 4 dd 14500
align 16
_7500:
    times 4 dd 7500
align 16
_12000:
    times 4 dd 12000
align 16
_51000:
    times 4 dd 51000