;
;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
;
;  Use of this source code is governed by a BSD-style license
;  that can be found in the LICENSE file in the root of the source
;  tree. An additional intellectual property rights grant can be found
;  in the file PATENTS.  All contributing project authors may
;  be found in the AUTHORS file in the root of the source tree.
;


%include "vpx_ports/x86_abi_support.asm"

;-----------------------------------------------------------------------------
; void vp8_short_fdct4x4_mmx(short *input, short *output, int pitch)
;
; Two-pass 4x4 forward transform on 16-bit samples using MMX.
;
; In:   arg(0) = input   four rows of four 16-bit words are read
;       arg(1) = output  sixteen 16-bit words are written contiguously
;                        (quadwords at byte offsets 0, 8, 16 and 24)
;       arg(2) = pitch   added directly to the input address to step from
;                        one row to the next, i.e. it is a stride in BYTES
;
; Pass 1 (lanes = input rows; c0..c3 are the four samples of a row):
;       a1 = (c0 + c3) << 3            b1 = (c1 + c2) << 3
;       c1 = (c1 - c2) << 3            d1 = (c0 - c3) << 3
;       t0 = a1 + b1                   t2 = a1 - b1
;       t1 = (c1*2217 + d1*5352 + 14500) >> 12
;       t3 = (d1*2217 - c1*5352 +  7500) >> 12
;
; Pass 2 (lanes = columns; r0..r3 are the four rows of the pass-1 result):
;       a1 = r0 + r3                   b1 = r1 + r2
;       c1 = r1 - r2                   d1 = r0 - r3
;       op[0]  = (a1 + b1 + 7) >> 4    op[8]  = (a1 - b1 + 7) >> 4
;       op[4]  = ((c1*2217 + d1*5352 + 12000) >> 16) + (d1 != 0)
;       op[12] =  (d1*2217 - c1*5352 + 51000) >> 16
;
; Word adds/subs/shifts wrap at 16 bits; packssdw saturates to signed 16 bit.
;
; Writes: rax, rcx, mm0-mm6.  rsi and rdi are saved and restored here.
; NOTE(review): sym(), arg(), GLOBAL(), SHADOW_ARGS_TO_STACK, GET_GOT,
;       RESTORE_GOT, UNSHADOW_ARGS and SECTION_RODATA come from the included
;       x86_abi_support.asm; their expansions (including what GET_GOT does to
;       rbx) are not visible in this file.
; NOTE(review): this routine does not execute emms. Presumably the caller is
;       responsible for clearing MMX state before x87 use - confirm.
;-----------------------------------------------------------------------------
global sym(vp8_short_fdct4x4_mmx) PRIVATE
sym(vp8_short_fdct4x4_mmx):
    push        rbp
    mov         rbp,        rsp
    SHADOW_ARGS_TO_STACK 3
    GET_GOT     rbx
    push        rsi
    push        rdi
    ; end prolog

        mov         rsi,        arg(0)              ; input
        mov         rdi,        arg(1)              ; output

        movsxd      rax,        dword ptr arg(2)    ; pitch (sign-extended)

        lea         rcx,        [rsi + rax*2]       ; rcx = address of row 2

        ; read the four input rows
        movq        mm0,        [rsi]               ; row 0
        movq        mm1,        [rsi + rax]         ; row 1

        movq        mm2,        [rcx]               ; row 2
        movq        mm4,        [rcx + rax]         ; row 3

        ; transpose for the first pass
        ; (lane comments list words low to high, as "row column")
        movq        mm3,        mm0                 ; 00 01 02 03
        movq        mm5,        mm2                 ; 20 21 22 23

        punpcklwd   mm0,        mm1                 ; 00 10 01 11
        punpckhwd   mm3,        mm1                 ; 02 12 03 13

        punpcklwd   mm2,        mm4                 ; 20 30 21 31
        punpckhwd   mm5,        mm4                 ; 22 32 23 33

        movq        mm1,        mm0                 ; 00 10 01 11
        punpckldq   mm0,        mm2                 ; 00 10 20 30

        punpckhdq   mm1,        mm2                 ; 01 11 21 31

        movq        mm2,        mm3                 ; 02 12 03 13
        punpckldq   mm2,        mm5                 ; 02 12 22 32

        punpckhdq   mm3,        mm5                 ; 03 13 23 33

        ; mm0 = column 0
        ; mm1 = column 1
        ; mm2 = column 2
        ; mm3 = column 3

        ; first pass
        movq        mm5,        mm0
        movq        mm4,        mm1

        paddw       mm0,        mm3                 ; a1 = 0 + 3
        paddw       mm1,        mm2                 ; b1 = 1 + 2

        psubw       mm4,        mm2                 ; c1 = 1 - 2
        psubw       mm5,        mm3                 ; d1 = 0 - 3

        psllw       mm5,        3                   ; d1 *= 8
        psllw       mm4,        3                   ; c1 *= 8

        psllw       mm0,        3                   ; a1 *= 8
        psllw       mm1,        3                   ; b1 *= 8

        ; output 0 and 2
        movq        mm2,        mm0                 ; a1

        paddw       mm0,        mm1                 ; op[0] = a1 + b1
        psubw       mm2,        mm1                 ; op[2] = a1 - b1

        ; output 1 and 3
        ; interleave d1 (even words) with c1 (odd words) so that each
        ; pmaddwd dword lane becomes d1*k0 + c1*k1
        movq        mm1,        mm5                 ; d1
        punpcklwd   mm1,        mm4                 ; d1.0 c1.0 d1.1 c1.1
        punpckhwd   mm5,        mm4                 ; d1.2 c1.2 d1.3 c1.3

        movq        mm3,        mm1
        movq        mm4,        mm5

        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]     ; c1*2217 + d1*5352
        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]     ; c1*2217 + d1*5352

        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]   ; d1*2217 - c1*5352
        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]   ; d1*2217 - c1*5352

        paddd       mm1,        MMWORD PTR[GLOBAL(_14500)]
        paddd       mm4,        MMWORD PTR[GLOBAL(_14500)]
        paddd       mm3,        MMWORD PTR[GLOBAL(_7500)]
        paddd       mm5,        MMWORD PTR[GLOBAL(_7500)]

        psrad       mm1,        12                  ; (c1 * 2217 + d1 * 5352 + 14500)>>12
        psrad       mm4,        12                  ; (c1 * 2217 + d1 * 5352 + 14500)>>12
        psrad       mm3,        12                  ; (d1 * 2217 - c1 * 5352 +  7500)>>12
        psrad       mm5,        12                  ; (d1 * 2217 - c1 * 5352 +  7500)>>12

        packssdw    mm1,        mm4                 ; op[1]
        packssdw    mm3,        mm5                 ; op[3]

        ; first pass done
        ; transpose for the second pass
        movq        mm4,        mm0                 ; 00 10 20 30
        movq        mm5,        mm2                 ; 02 12 22 32

        punpcklwd   mm0,        mm1                 ; 00 01 10 11
        punpckhwd   mm4,        mm1                 ; 20 21 30 31

        punpcklwd   mm2,        mm3                 ; 02 03 12 13
        punpckhwd   mm5,        mm3                 ; 22 23 32 33

        movq        mm1,        mm0                 ; 00 01 10 11
        punpckldq   mm0,        mm2                 ; 00 01 02 03

        punpckhdq   mm1,        mm2                 ; 10 11 12 13

        movq        mm2,        mm4                 ; 20 21 30 31
        punpckldq   mm2,        mm5                 ; 20 21 22 23

        punpckhdq   mm4,        mm5                 ; 30 31 32 33

        ; mm0 = row 0
        ; mm1 = row 1
        ; mm2 = row 2
        ; mm4 = row 3   (note: mm4, not mm3)

        ; second pass
        movq        mm5,        mm0
        movq        mm3,        mm1

        paddw       mm0,        mm4                 ; a1 = 0 + 3
        paddw       mm1,        mm2                 ; b1 = 1 + 2

        psubw       mm3,        mm2                 ; c1 = 1 - 2
        psubw       mm5,        mm4                 ; d1 = 0 - 3

        pxor        mm6,        mm6                 ; zero out for compare

        pcmpeqw     mm6,        mm5                 ; 0xffff where d1 == 0

        pandn       mm6,        MMWORD PTR[GLOBAL(_cmp_mask)]       ; ~(d1 == 0) & 1
                                                                    ; = (d1 != 0) as 0/1

        ; output 0 and 2
        movq        mm2,        mm0                 ; a1

        paddw       mm0,        mm1                 ; a1 + b1
        psubw       mm2,        mm1                 ; a1 - b1

        paddw       mm0,        MMWORD PTR[GLOBAL(_7w)]
        paddw       mm2,        MMWORD PTR[GLOBAL(_7w)]

        psraw       mm0,        4                   ; op[0] = (a1 + b1 + 7)>>4
        psraw       mm2,        4                   ; op[8] = (a1 - b1 + 7)>>4

        movq        MMWORD PTR[rdi + 0 ],  mm0
        movq        MMWORD PTR[rdi + 16],  mm2

        ; output 1 and 3
        ; interleave d1 (even words) with c1 (odd words), as in the first pass
        movq        mm1,        mm5                 ; d1
        punpcklwd   mm1,        mm3                 ; d1.0 c1.0 d1.1 c1.1
        punpckhwd   mm5,        mm3                 ; d1.2 c1.2 d1.3 c1.3

        movq        mm3,        mm1
        movq        mm4,        mm5

        pmaddwd     mm1,        MMWORD PTR[GLOBAL (_5352_2217)]     ; c1*2217 + d1*5352
        pmaddwd     mm4,        MMWORD PTR[GLOBAL (_5352_2217)]     ; c1*2217 + d1*5352

        pmaddwd     mm3,        MMWORD PTR[GLOBAL(_2217_neg5352)]   ; d1*2217 - c1*5352
        pmaddwd     mm5,        MMWORD PTR[GLOBAL(_2217_neg5352)]   ; d1*2217 - c1*5352

        paddd       mm1,        MMWORD PTR[GLOBAL(_12000)]
        paddd       mm4,        MMWORD PTR[GLOBAL(_12000)]
        paddd       mm3,        MMWORD PTR[GLOBAL(_51000)]
        paddd       mm5,        MMWORD PTR[GLOBAL(_51000)]

        psrad       mm1,        16                  ; (c1 * 2217 + d1 * 5352 + 12000)>>16
        psrad       mm4,        16                  ; (c1 * 2217 + d1 * 5352 + 12000)>>16
        psrad       mm3,        16                  ; (d1 * 2217 - c1 * 5352 + 51000)>>16
        psrad       mm5,        16                  ; (d1 * 2217 - c1 * 5352 + 51000)>>16

        packssdw    mm1,        mm4                 ; op[4]
        packssdw    mm3,        mm5                 ; op[12]

        paddw       mm1,        mm6                 ; op[4] += (d1!=0)

        movq        MMWORD PTR[rdi + 8 ],  mm1
        movq        MMWORD PTR[rdi + 24],  mm3

    ; begin epilog
    pop         rdi
    pop         rsi
    RESTORE_GOT
    UNSHADOW_ARGS
    pop         rbp
    ret

SECTION_RODATA
; pmaddwd multiplier pairs: the even word multiplies d1, the odd word c1
align 8
_5352_2217:
    dw 5352
    dw 2217
    dw 5352
    dw 2217
align 8
_2217_neg5352:
    dw 2217
    dw -5352
    dw 2217
    dw -5352
; per-word 1, used to reduce the compare mask to a 0/1 value
align 8
_cmp_mask:
    times 4 dw 1
; per-word rounding term added before the second-pass >>4
align 8
_7w:
    times 4 dw 7
; per-dword rounding terms added before the first-pass >>12
align 8
_14500:
    times 2 dd 14500
align 8
_7500:
    times 2 dd 7500
; per-dword rounding terms added before the second-pass >>16
align 8
_12000:
    times 2 dd 12000
align 8
_51000:
    times 2 dd 51000