michael@0: /******************************************************************** michael@0: * * michael@0: * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * michael@0: * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * michael@0: * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * michael@0: * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * michael@0: * * michael@0: * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * michael@0: * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * michael@0: * * michael@0: ******************************************************************** michael@0: michael@0: function: michael@0: last mod: $Id: mmxidct.c 17446 2010-09-23 20:06:20Z tterribe $ michael@0: michael@0: ********************************************************************/ michael@0: michael@0: /*MMX acceleration of Theora's iDCT. michael@0: Originally written by Rudolf Marek, based on code from On2's VP3.*/ michael@0: #include "x86int.h" michael@0: #include "../dct.h" michael@0: michael@0: #if defined(OC_X86_ASM) michael@0: michael@0: /*These are offsets into the table of constants below.*/ michael@0: /*7 rows of cosines, in order: pi/16 * (1 ... 7).*/ michael@0: #define OC_COSINE_OFFSET (0) michael@0: /*A row of 8's.*/ michael@0: #define OC_EIGHT_OFFSET (56) michael@0: michael@0: michael@0: michael@0: /*38 cycles*/ michael@0: #define OC_IDCT_BEGIN(_y,_x) \ michael@0: "#OC_IDCT_BEGIN\n\t" \ michael@0: "movq "OC_I(3,_x)",%%mm2\n\t" \ michael@0: "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \ michael@0: "movq %%mm2,%%mm4\n\t" \ michael@0: "movq "OC_J(5,_x)",%%mm7\n\t" \ michael@0: "pmulhw %%mm6,%%mm4\n\t" \ michael@0: "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \ michael@0: "pmulhw %%mm7,%%mm6\n\t" \ michael@0: "movq %%mm1,%%mm5\n\t" \ michael@0: "pmulhw %%mm2,%%mm1\n\t" \ michael@0: "movq "OC_I(1,_x)",%%mm3\n\t" \ michael@0: "pmulhw %%mm7,%%mm5\n\t" \ michael@0: "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \ michael@0: "paddw %%mm2,%%mm4\n\t" \ michael@0: "paddw %%mm7,%%mm6\n\t" \ michael@0: "paddw %%mm1,%%mm2\n\t" \ michael@0: "movq "OC_J(7,_x)",%%mm1\n\t" \ michael@0: "paddw %%mm5,%%mm7\n\t" \ michael@0: "movq %%mm0,%%mm5\n\t" \ michael@0: "pmulhw %%mm3,%%mm0\n\t" \ michael@0: "paddw %%mm7,%%mm4\n\t" \ michael@0: "pmulhw %%mm1,%%mm5\n\t" \ michael@0: "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \ michael@0: "psubw %%mm2,%%mm6\n\t" \ michael@0: "paddw %%mm3,%%mm0\n\t" \ michael@0: "pmulhw %%mm7,%%mm3\n\t" \ michael@0: "movq "OC_I(2,_x)",%%mm2\n\t" \ michael@0: "pmulhw %%mm1,%%mm7\n\t" \ michael@0: "paddw %%mm1,%%mm5\n\t" \ michael@0: "movq %%mm2,%%mm1\n\t" \ michael@0: "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \ michael@0: "psubw %%mm5,%%mm3\n\t" \ michael@0: "movq "OC_J(6,_x)",%%mm5\n\t" \ michael@0: "paddw %%mm7,%%mm0\n\t" \ michael@0: "movq %%mm5,%%mm7\n\t" \ michael@0: "psubw %%mm4,%%mm0\n\t" \ michael@0: "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \ michael@0: "paddw %%mm1,%%mm2\n\t" \ michael@0: "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \ michael@0: "paddw %%mm4,%%mm4\n\t" \ michael@0: "paddw %%mm0,%%mm4\n\t" \ michael@0: "psubw %%mm6,%%mm3\n\t" \ michael@0: "paddw %%mm7,%%mm5\n\t" \ michael@0: "paddw %%mm6,%%mm6\n\t" \ michael@0: "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \ michael@0: "paddw %%mm3,%%mm6\n\t" \ michael@0: "movq %%mm4,"OC_I(1,_y)"\n\t" \ michael@0: "psubw %%mm5,%%mm1\n\t" \ michael@0: "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ michael@0: "movq %%mm3,%%mm5\n\t" \ michael@0: "pmulhw %%mm4,%%mm3\n\t" \ michael@0: "paddw %%mm2,%%mm7\n\t" \ michael@0: "movq %%mm6,"OC_I(2,_y)"\n\t" \ michael@0: "movq %%mm0,%%mm2\n\t" \ michael@0: "movq "OC_I(0,_x)",%%mm6\n\t" \ michael@0: "pmulhw %%mm4,%%mm0\n\t" \ michael@0: "paddw %%mm3,%%mm5\n\t" \ michael@0: "movq "OC_J(4,_x)",%%mm3\n\t" \ michael@0: "psubw %%mm1,%%mm5\n\t" \ michael@0: "paddw %%mm0,%%mm2\n\t" \ michael@0: "psubw %%mm3,%%mm6\n\t" \ michael@0: "movq %%mm6,%%mm0\n\t" \ michael@0: "pmulhw %%mm4,%%mm6\n\t" \ michael@0: "paddw %%mm3,%%mm3\n\t" \ michael@0: "paddw %%mm1,%%mm1\n\t" \ michael@0: "paddw %%mm0,%%mm3\n\t" \ michael@0: "paddw %%mm5,%%mm1\n\t" \ michael@0: "pmulhw %%mm3,%%mm4\n\t" \ michael@0: "paddw %%mm0,%%mm6\n\t" \ michael@0: "psubw %%mm2,%%mm6\n\t" \ michael@0: "paddw %%mm2,%%mm2\n\t" \ michael@0: "movq "OC_I(1,_y)",%%mm0\n\t" \ michael@0: "paddw %%mm6,%%mm2\n\t" \ michael@0: "paddw %%mm3,%%mm4\n\t" \ michael@0: "psubw %%mm1,%%mm2\n\t" \ michael@0: "#end OC_IDCT_BEGIN\n\t" \ michael@0: michael@0: /*38+8=46 cycles.*/ michael@0: #define OC_ROW_IDCT(_y,_x) \ michael@0: "#OC_ROW_IDCT\n" \ michael@0: OC_IDCT_BEGIN(_y,_x) \ michael@0: /*r3=D'*/ \ michael@0: "movq "OC_I(2,_y)",%%mm3\n\t" \ michael@0: /*r4=E'=E-G*/ \ michael@0: "psubw %%mm7,%%mm4\n\t" \ michael@0: /*r1=H'+H'*/ \ michael@0: "paddw %%mm1,%%mm1\n\t" \ michael@0: /*r7=G+G*/ \ michael@0: "paddw %%mm7,%%mm7\n\t" \ michael@0: /*r1=R1=A''+H'*/ \ michael@0: "paddw %%mm2,%%mm1\n\t" \ michael@0: /*r7=G'=E+G*/ \ michael@0: "paddw %%mm4,%%mm7\n\t" \ michael@0: /*r4=R4=E'-D'*/ \ michael@0: "psubw %%mm3,%%mm4\n\t" \ michael@0: "paddw %%mm3,%%mm3\n\t" \ michael@0: /*r6=R6=F'-B''*/ \ michael@0: "psubw %%mm5,%%mm6\n\t" \ michael@0: "paddw %%mm5,%%mm5\n\t" \ michael@0: /*r3=R3=E'+D'*/ \ michael@0: "paddw %%mm4,%%mm3\n\t" \ michael@0: /*r5=R5=F'+B''*/ \ michael@0: "paddw %%mm6,%%mm5\n\t" \ michael@0: /*r7=R7=G'-C'*/ \ michael@0: "psubw %%mm0,%%mm7\n\t" \ michael@0: "paddw %%mm0,%%mm0\n\t" \ michael@0: /*Save R1.*/ \ michael@0: "movq %%mm1,"OC_I(1,_y)"\n\t" \ michael@0: /*r0=R0=G.+C.*/ \ michael@0: "paddw %%mm7,%%mm0\n\t" \ michael@0: "#end OC_ROW_IDCT\n\t" \ michael@0: michael@0: /*The following macro does two 4x4 transposes in place. michael@0: At entry, we assume: michael@0: r0 = a3 a2 a1 a0 michael@0: I(1) = b3 b2 b1 b0 michael@0: r2 = c3 c2 c1 c0 michael@0: r3 = d3 d2 d1 d0 michael@0: michael@0: r4 = e3 e2 e1 e0 michael@0: r5 = f3 f2 f1 f0 michael@0: r6 = g3 g2 g1 g0 michael@0: r7 = h3 h2 h1 h0 michael@0: michael@0: At exit, we have: michael@0: I(0) = d0 c0 b0 a0 michael@0: I(1) = d1 c1 b1 a1 michael@0: I(2) = d2 c2 b2 a2 michael@0: I(3) = d3 c3 b3 a3 michael@0: michael@0: J(4) = h0 g0 f0 e0 michael@0: J(5) = h1 g1 f1 e1 michael@0: J(6) = h2 g2 f2 e2 michael@0: J(7) = h3 g3 f3 e3 michael@0: michael@0: I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. michael@0: J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. michael@0: michael@0: Since r1 is free at entry, we calculate the Js first.*/ michael@0: /*19 cycles.*/ michael@0: #define OC_TRANSPOSE(_y) \ michael@0: "#OC_TRANSPOSE\n\t" \ michael@0: "movq %%mm4,%%mm1\n\t" \ michael@0: "punpcklwd %%mm5,%%mm4\n\t" \ michael@0: "movq %%mm0,"OC_I(0,_y)"\n\t" \ michael@0: "punpckhwd %%mm5,%%mm1\n\t" \ michael@0: "movq %%mm6,%%mm0\n\t" \ michael@0: "punpcklwd %%mm7,%%mm6\n\t" \ michael@0: "movq %%mm4,%%mm5\n\t" \ michael@0: "punpckldq %%mm6,%%mm4\n\t" \ michael@0: "punpckhdq %%mm6,%%mm5\n\t" \ michael@0: "movq %%mm1,%%mm6\n\t" \ michael@0: "movq %%mm4,"OC_J(4,_y)"\n\t" \ michael@0: "punpckhwd %%mm7,%%mm0\n\t" \ michael@0: "movq %%mm5,"OC_J(5,_y)"\n\t" \ michael@0: "punpckhdq %%mm0,%%mm6\n\t" \ michael@0: "movq "OC_I(0,_y)",%%mm4\n\t" \ michael@0: "punpckldq %%mm0,%%mm1\n\t" \ michael@0: "movq "OC_I(1,_y)",%%mm5\n\t" \ michael@0: "movq %%mm4,%%mm0\n\t" \ michael@0: "movq %%mm6,"OC_J(7,_y)"\n\t" \ michael@0: "punpcklwd %%mm5,%%mm0\n\t" \ michael@0: "movq %%mm1,"OC_J(6,_y)"\n\t" \ michael@0: "punpckhwd %%mm5,%%mm4\n\t" \ michael@0: "movq %%mm2,%%mm5\n\t" \ michael@0: "punpcklwd %%mm3,%%mm2\n\t" \ michael@0: "movq %%mm0,%%mm1\n\t" \ michael@0: "punpckldq %%mm2,%%mm0\n\t" \ michael@0: "punpckhdq %%mm2,%%mm1\n\t" \ michael@0: "movq %%mm4,%%mm2\n\t" \ michael@0: "movq %%mm0,"OC_I(0,_y)"\n\t" \ michael@0: "punpckhwd %%mm3,%%mm5\n\t" \ michael@0: "movq %%mm1,"OC_I(1,_y)"\n\t" \ michael@0: "punpckhdq %%mm5,%%mm4\n\t" \ michael@0: "punpckldq %%mm5,%%mm2\n\t" \ michael@0: "movq %%mm4,"OC_I(3,_y)"\n\t" \ michael@0: "movq %%mm2,"OC_I(2,_y)"\n\t" \ michael@0: "#end OC_TRANSPOSE\n\t" \ michael@0: michael@0: /*38+19=57 cycles.*/ michael@0: #define OC_COLUMN_IDCT(_y) \ michael@0: "#OC_COLUMN_IDCT\n" \ michael@0: OC_IDCT_BEGIN(_y,_y) \ michael@0: "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \ michael@0: /*r1=H'+H'*/ \ michael@0: "paddw %%mm1,%%mm1\n\t" \ michael@0: /*r1=R1=A''+H'*/ \ michael@0: "paddw %%mm2,%%mm1\n\t" \ michael@0: /*r2=NR2*/ \ michael@0: "psraw $4,%%mm2\n\t" \ michael@0: /*r4=E'=E-G*/ \ michael@0: "psubw %%mm7,%%mm4\n\t" \ michael@0: /*r1=NR1*/ \ michael@0: "psraw $4,%%mm1\n\t" \ michael@0: /*r3=D'*/ \ michael@0: "movq "OC_I(2,_y)",%%mm3\n\t" \ michael@0: /*r7=G+G*/ \ michael@0: "paddw %%mm7,%%mm7\n\t" \ michael@0: /*Store NR2 at I(2).*/ \ michael@0: "movq %%mm2,"OC_I(2,_y)"\n\t" \ michael@0: /*r7=G'=E+G*/ \ michael@0: "paddw %%mm4,%%mm7\n\t" \ michael@0: /*Store NR1 at I(1).*/ \ michael@0: "movq %%mm1,"OC_I(1,_y)"\n\t" \ michael@0: /*r4=R4=E'-D'*/ \ michael@0: "psubw %%mm3,%%mm4\n\t" \ michael@0: "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \ michael@0: /*r3=D'+D'*/ \ michael@0: "paddw %%mm3,%%mm3\n\t" \ michael@0: /*r3=R3=E'+D'*/ \ michael@0: "paddw %%mm4,%%mm3\n\t" \ michael@0: /*r4=NR4*/ \ michael@0: "psraw $4,%%mm4\n\t" \ michael@0: /*r6=R6=F'-B''*/ \ michael@0: "psubw %%mm5,%%mm6\n\t" \ michael@0: /*r3=NR3*/ \ michael@0: "psraw $4,%%mm3\n\t" \ michael@0: "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \ michael@0: /*r5=B''+B''*/ \ michael@0: "paddw %%mm5,%%mm5\n\t" \ michael@0: /*r5=R5=F'+B''*/ \ michael@0: "paddw %%mm6,%%mm5\n\t" \ michael@0: /*r6=NR6*/ \ michael@0: "psraw $4,%%mm6\n\t" \ michael@0: /*Store NR4 at J(4).*/ \ michael@0: "movq %%mm4,"OC_J(4,_y)"\n\t" \ michael@0: /*r5=NR5*/ \ michael@0: "psraw $4,%%mm5\n\t" \ michael@0: /*Store NR3 at I(3).*/ \ michael@0: "movq %%mm3,"OC_I(3,_y)"\n\t" \ michael@0: /*r7=R7=G'-C'*/ \ michael@0: "psubw %%mm0,%%mm7\n\t" \ michael@0: "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \ michael@0: /*r0=C'+C'*/ \ michael@0: "paddw %%mm0,%%mm0\n\t" \ michael@0: /*r0=R0=G'+C'*/ \ michael@0: "paddw %%mm7,%%mm0\n\t" \ michael@0: /*r7=NR7*/ \ michael@0: "psraw $4,%%mm7\n\t" \ michael@0: /*Store NR6 at J(6).*/ \ michael@0: "movq %%mm6,"OC_J(6,_y)"\n\t" \ michael@0: /*r0=NR0*/ \ michael@0: "psraw $4,%%mm0\n\t" \ michael@0: /*Store NR5 at J(5).*/ \ michael@0: "movq %%mm5,"OC_J(5,_y)"\n\t" \ michael@0: /*Store NR7 at J(7).*/ \ michael@0: "movq %%mm7,"OC_J(7,_y)"\n\t" \ michael@0: /*Store NR0 at I(0).*/ \ michael@0: "movq %%mm0,"OC_I(0,_y)"\n\t" \ michael@0: "#end OC_COLUMN_IDCT\n\t" \ michael@0: michael@0: static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){ michael@0: /*This routine accepts an 8x8 matrix, but in partially transposed form. michael@0: Every 4x4 block is transposed.*/ michael@0: __asm__ __volatile__( michael@0: #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) michael@0: #define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y) michael@0: OC_ROW_IDCT(y,x) michael@0: OC_TRANSPOSE(y) michael@0: #undef OC_I michael@0: #undef OC_J michael@0: #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+64,_y) michael@0: #define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+72,_y) michael@0: OC_ROW_IDCT(y,x) michael@0: OC_TRANSPOSE(y) michael@0: #undef OC_I michael@0: #undef OC_J michael@0: #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) michael@0: #define OC_J(_k,_y) OC_I(_k,_y) michael@0: OC_COLUMN_IDCT(y) michael@0: #undef OC_I michael@0: #undef OC_J michael@0: #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y) michael@0: #define OC_J(_k,_y) OC_I(_k,_y) michael@0: OC_COLUMN_IDCT(y) michael@0: #undef OC_I michael@0: #undef OC_J michael@0: :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64) michael@0: :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), michael@0: [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128) michael@0: ); michael@0: if(_x!=_y){ michael@0: int i; michael@0: __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::); michael@0: for(i=0;i<4;i++){ michael@0: __asm__ __volatile__( michael@0: "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" michael@0: "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t" michael@0: "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" michael@0: "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t" michael@0: :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16) michael@0: ); michael@0: } michael@0: } michael@0: } michael@0: michael@0: /*25 cycles.*/ michael@0: #define OC_IDCT_BEGIN_10(_y,_x) \ michael@0: "#OC_IDCT_BEGIN_10\n\t" \ michael@0: "movq "OC_I(3,_x)",%%mm2\n\t" \ michael@0: "nop\n\t" \ michael@0: "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \ michael@0: "movq %%mm2,%%mm4\n\t" \ michael@0: "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \ michael@0: "pmulhw %%mm6,%%mm4\n\t" \ michael@0: "movq "OC_I(1,_x)",%%mm3\n\t" \ michael@0: "pmulhw %%mm2,%%mm1\n\t" \ michael@0: "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \ michael@0: "paddw %%mm2,%%mm4\n\t" \ michael@0: "pxor %%mm6,%%mm6\n\t" \ michael@0: "paddw %%mm1,%%mm2\n\t" \ michael@0: "movq "OC_I(2,_x)",%%mm5\n\t" \ michael@0: "pmulhw %%mm3,%%mm0\n\t" \ michael@0: "movq %%mm5,%%mm1\n\t" \ michael@0: "paddw %%mm3,%%mm0\n\t" \ michael@0: "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \ michael@0: "psubw %%mm2,%%mm6\n\t" \ michael@0: "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \ michael@0: "psubw %%mm4,%%mm0\n\t" \ michael@0: "movq "OC_I(2,_x)",%%mm7\n\t" \ michael@0: "paddw %%mm4,%%mm4\n\t" \ michael@0: "paddw %%mm5,%%mm7\n\t" \ michael@0: "paddw %%mm0,%%mm4\n\t" \ michael@0: "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \ michael@0: "psubw %%mm6,%%mm3\n\t" \ michael@0: "movq %%mm4,"OC_I(1,_y)"\n\t" \ michael@0: "paddw %%mm6,%%mm6\n\t" \ michael@0: "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ michael@0: "paddw %%mm3,%%mm6\n\t" \ michael@0: "movq %%mm3,%%mm5\n\t" \ michael@0: "pmulhw %%mm4,%%mm3\n\t" \ michael@0: "movq %%mm6,"OC_I(2,_y)"\n\t" \ michael@0: "movq %%mm0,%%mm2\n\t" \ michael@0: "movq "OC_I(0,_x)",%%mm6\n\t" \ michael@0: "pmulhw %%mm4,%%mm0\n\t" \ michael@0: "paddw %%mm3,%%mm5\n\t" \ michael@0: "paddw %%mm0,%%mm2\n\t" \ michael@0: "psubw %%mm1,%%mm5\n\t" \ michael@0: "pmulhw %%mm4,%%mm6\n\t" \ michael@0: "paddw "OC_I(0,_x)",%%mm6\n\t" \ michael@0: "paddw %%mm1,%%mm1\n\t" \ michael@0: "movq %%mm6,%%mm4\n\t" \ michael@0: "paddw %%mm5,%%mm1\n\t" \ michael@0: "psubw %%mm2,%%mm6\n\t" \ michael@0: "paddw %%mm2,%%mm2\n\t" \ michael@0: "movq "OC_I(1,_y)",%%mm0\n\t" \ michael@0: "paddw %%mm6,%%mm2\n\t" \ michael@0: "psubw %%mm1,%%mm2\n\t" \ michael@0: "nop\n\t" \ michael@0: "#end OC_IDCT_BEGIN_10\n\t" \ michael@0: michael@0: /*25+8=33 cycles.*/ michael@0: #define OC_ROW_IDCT_10(_y,_x) \ michael@0: "#OC_ROW_IDCT_10\n\t" \ michael@0: OC_IDCT_BEGIN_10(_y,_x) \ michael@0: /*r3=D'*/ \ michael@0: "movq "OC_I(2,_y)",%%mm3\n\t" \ michael@0: /*r4=E'=E-G*/ \ michael@0: "psubw %%mm7,%%mm4\n\t" \ michael@0: /*r1=H'+H'*/ \ michael@0: "paddw %%mm1,%%mm1\n\t" \ michael@0: /*r7=G+G*/ \ michael@0: "paddw %%mm7,%%mm7\n\t" \ michael@0: /*r1=R1=A''+H'*/ \ michael@0: "paddw %%mm2,%%mm1\n\t" \ michael@0: /*r7=G'=E+G*/ \ michael@0: "paddw %%mm4,%%mm7\n\t" \ michael@0: /*r4=R4=E'-D'*/ \ michael@0: "psubw %%mm3,%%mm4\n\t" \ michael@0: "paddw %%mm3,%%mm3\n\t" \ michael@0: /*r6=R6=F'-B''*/ \ michael@0: "psubw %%mm5,%%mm6\n\t" \ michael@0: "paddw %%mm5,%%mm5\n\t" \ michael@0: /*r3=R3=E'+D'*/ \ michael@0: "paddw %%mm4,%%mm3\n\t" \ michael@0: /*r5=R5=F'+B''*/ \ michael@0: "paddw %%mm6,%%mm5\n\t" \ michael@0: /*r7=R7=G'-C'*/ \ michael@0: "psubw %%mm0,%%mm7\n\t" \ michael@0: "paddw %%mm0,%%mm0\n\t" \ michael@0: /*Save R1.*/ \ michael@0: "movq %%mm1,"OC_I(1,_y)"\n\t" \ michael@0: /*r0=R0=G'+C'*/ \ michael@0: "paddw %%mm7,%%mm0\n\t" \ michael@0: "#end OC_ROW_IDCT_10\n\t" \ michael@0: michael@0: /*25+19=44 cycles'*/ michael@0: #define OC_COLUMN_IDCT_10(_y) \ michael@0: "#OC_COLUMN_IDCT_10\n\t" \ michael@0: OC_IDCT_BEGIN_10(_y,_y) \ michael@0: "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \ michael@0: /*r1=H'+H'*/ \ michael@0: "paddw %%mm1,%%mm1\n\t" \ michael@0: /*r1=R1=A''+H'*/ \ michael@0: "paddw %%mm2,%%mm1\n\t" \ michael@0: /*r2=NR2*/ \ michael@0: "psraw $4,%%mm2\n\t" \ michael@0: /*r4=E'=E-G*/ \ michael@0: "psubw %%mm7,%%mm4\n\t" \ michael@0: /*r1=NR1*/ \ michael@0: "psraw $4,%%mm1\n\t" \ michael@0: /*r3=D'*/ \ michael@0: "movq "OC_I(2,_y)",%%mm3\n\t" \ michael@0: /*r7=G+G*/ \ michael@0: "paddw %%mm7,%%mm7\n\t" \ michael@0: /*Store NR2 at I(2).*/ \ michael@0: "movq %%mm2,"OC_I(2,_y)"\n\t" \ michael@0: /*r7=G'=E+G*/ \ michael@0: "paddw %%mm4,%%mm7\n\t" \ michael@0: /*Store NR1 at I(1).*/ \ michael@0: "movq %%mm1,"OC_I(1,_y)"\n\t" \ michael@0: /*r4=R4=E'-D'*/ \ michael@0: "psubw %%mm3,%%mm4\n\t" \ michael@0: "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \ michael@0: /*r3=D'+D'*/ \ michael@0: "paddw %%mm3,%%mm3\n\t" \ michael@0: /*r3=R3=E'+D'*/ \ michael@0: "paddw %%mm4,%%mm3\n\t" \ michael@0: /*r4=NR4*/ \ michael@0: "psraw $4,%%mm4\n\t" \ michael@0: /*r6=R6=F'-B''*/ \ michael@0: "psubw %%mm5,%%mm6\n\t" \ michael@0: /*r3=NR3*/ \ michael@0: "psraw $4,%%mm3\n\t" \ michael@0: "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \ michael@0: /*r5=B''+B''*/ \ michael@0: "paddw %%mm5,%%mm5\n\t" \ michael@0: /*r5=R5=F'+B''*/ \ michael@0: "paddw %%mm6,%%mm5\n\t" \ michael@0: /*r6=NR6*/ \ michael@0: "psraw $4,%%mm6\n\t" \ michael@0: /*Store NR4 at J(4).*/ \ michael@0: "movq %%mm4,"OC_J(4,_y)"\n\t" \ michael@0: /*r5=NR5*/ \ michael@0: "psraw $4,%%mm5\n\t" \ michael@0: /*Store NR3 at I(3).*/ \ michael@0: "movq %%mm3,"OC_I(3,_y)"\n\t" \ michael@0: /*r7=R7=G'-C'*/ \ michael@0: "psubw %%mm0,%%mm7\n\t" \ michael@0: "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \ michael@0: /*r0=C'+C'*/ \ michael@0: "paddw %%mm0,%%mm0\n\t" \ michael@0: /*r0=R0=G'+C'*/ \ michael@0: "paddw %%mm7,%%mm0\n\t" \ michael@0: /*r7=NR7*/ \ michael@0: "psraw $4,%%mm7\n\t" \ michael@0: /*Store NR6 at J(6).*/ \ michael@0: "movq %%mm6,"OC_J(6,_y)"\n\t" \ michael@0: /*r0=NR0*/ \ michael@0: "psraw $4,%%mm0\n\t" \ michael@0: /*Store NR5 at J(5).*/ \ michael@0: "movq %%mm5,"OC_J(5,_y)"\n\t" \ michael@0: /*Store NR7 at J(7).*/ \ michael@0: "movq %%mm7,"OC_J(7,_y)"\n\t" \ michael@0: /*Store NR0 at I(0).*/ \ michael@0: "movq %%mm0,"OC_I(0,_y)"\n\t" \ michael@0: "#end OC_COLUMN_IDCT_10\n\t" \ michael@0: michael@0: static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){ michael@0: __asm__ __volatile__( michael@0: #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) michael@0: #define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y) michael@0: /*Done with dequant, descramble, and partial transpose. michael@0: Now do the iDCT itself.*/ michael@0: OC_ROW_IDCT_10(y,x) michael@0: OC_TRANSPOSE(y) michael@0: #undef OC_I michael@0: #undef OC_J michael@0: #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) michael@0: #define OC_J(_k,_y) OC_I(_k,_y) michael@0: OC_COLUMN_IDCT_10(y) michael@0: #undef OC_I michael@0: #undef OC_J michael@0: #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y) michael@0: #define OC_J(_k,_y) OC_I(_k,_y) michael@0: OC_COLUMN_IDCT_10(y) michael@0: #undef OC_I michael@0: #undef OC_J michael@0: :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64) michael@0: :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), michael@0: [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128) michael@0: ); michael@0: if(_x!=_y){ michael@0: __asm__ __volatile__( michael@0: "pxor %%mm0,%%mm0\n\t" michael@0: "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" michael@0: "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" michael@0: "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t" michael@0: "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t" michael@0: :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28) michael@0: ); michael@0: } michael@0: } michael@0: michael@0: /*Performs an inverse 8x8 Type-II DCT transform. michael@0: The input is assumed to be scaled by a factor of 4 relative to orthonormal michael@0: version of the transform.*/ michael@0: void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ michael@0: /*_last_zzi is subtly different from an actual count of the number of michael@0: coefficients we decoded for this block. michael@0: It contains the value of zzi BEFORE the final token in the block was michael@0: decoded. michael@0: In most cases this is an EOB token (the continuation of an EOB run from a michael@0: previous block counts), and so this is the same as the coefficient count. michael@0: However, in the case that the last token was NOT an EOB token, but filled michael@0: the block up with exactly 64 coefficients, _last_zzi will be less than 64. michael@0: Provided the last token was not a pure zero run, the minimum value it can michael@0: be is 46, and so that doesn't affect any of the cases in this routine. michael@0: However, if the last token WAS a pure zero run of length 63, then _last_zzi michael@0: will be 1 while the number of coefficients decoded is 64. michael@0: Thus, we will trigger the following special case, where the real michael@0: coefficient count would not. michael@0: Note also that a zero run of length 64 will give _last_zzi a value of 0, michael@0: but we still process the DC coefficient, which might have a non-zero value michael@0: due to DC prediction. michael@0: Although convoluted, this is arguably the correct behavior: it allows us to michael@0: use a smaller transform when the block ends with a long zero run instead michael@0: of a normal EOB token. michael@0: It could be smarter... multiple separate zero runs at the end of a block michael@0: will fool it, but an encoder that generates these really deserves what it michael@0: gets. michael@0: Needless to say we inherited this approach from VP3.*/ michael@0: /*Then perform the iDCT.*/ michael@0: if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x); michael@0: else oc_idct8x8_slow_mmx(_y,_x); michael@0: } michael@0: michael@0: #endif