Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /******************************************************************** |
michael@0 | 2 | * * |
michael@0 | 3 | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
michael@0 | 4 | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
michael@0 | 5 | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
michael@0 | 6 | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
michael@0 | 7 | * * |
michael@0 | 8 | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
michael@0 | 9 | * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
michael@0 | 10 | * * |
michael@0 | 11 | ******************************************************************** |
michael@0 | 12 | |
michael@0 | 13 | function: |
michael@0 | 14 | last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $ |
michael@0 | 15 | |
michael@0 | 16 | ********************************************************************/ |
michael@0 | 17 | |
michael@0 | 18 | /*SSE2 acceleration of Theora's iDCT.*/ |
michael@0 | 19 | #include "x86int.h" |
michael@0 | 20 | #include "sse2trans.h" |
michael@0 | 21 | #include "../dct.h" |
michael@0 | 22 | |
michael@0 | 23 | #if defined(OC_X86_ASM) |
michael@0 | 24 | |
michael@0 | 25 | /*A table of constants shared by the MMX and SSE2 routines in this file, laid out as one 16-byte row per constant with the value replicated in all eight 16-bit lanes: row 0 is the rounding bias 8 that is added before the final >>4, and rows 1 through 7 are OC_C1S7 through OC_C7S1 (row k starts at byte offset 0x10*k).*/ |
michael@0 | 26 | const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={ |
michael@0 | 27 | 8, 8, 8, 8, 8, 8, 8, 8, |
michael@0 | 28 | OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7, |
michael@0 | 29 | OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6, |
michael@0 | 30 | OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5, |
michael@0 | 31 | OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4, |
michael@0 | 32 | OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3, |
michael@0 | 33 | OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2, |
michael@0 | 34 | OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1 |
michael@0 | 35 | }; |
michael@0 | 36 | |
michael@0 | 37 | |
michael@0 | 38 | /*Performs the first three stages of the iDCT. |
michael@0 | 39 | xmm2, xmm6, xmm3, and xmm5 must contain input rows 2, 6, 3, and 5, |
michael@0 | 40 | respectively (they are consumed in that order). |
michael@0 | 41 | The remaining rows (1, 7, 4, and 0) are loaded from the memory operand _x at offsets 0x10, 0x70, 0x40, and 0x00; the enclosing asm statement must also supply the operands c (the constant table) and buf (two 16-byte scratch slots). |
michael@0 | 42 | On output, xmm7 down to xmm4 hold the intermediate terms t[0] through t[3], |
michael@0 | 43 | and xmm0 up to xmm3 hold t[7] down to t[4] (see the stage comments below).*/ |
michael@0 | 44 | #define OC_IDCT_8x8_ABC(_x) \ |
michael@0 | 45 | "#OC_IDCT_8x8_ABC\n\t" \ |
michael@0 | 46 | /*Stage 1:*/ \ |
michael@0 | 47 | /*2-3 rotation by 6pi/16. \ |
michael@0 | 48 | xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \ |
michael@0 | 49 | "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \ |
michael@0 | 50 | "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \ |
michael@0 | 51 | "movdqa %%xmm1,%%xmm0\n\t" \ |
michael@0 | 52 | "pmulhw %%xmm2,%%xmm1\n\t" \ |
michael@0 | 53 | "movdqa %%xmm4,%%xmm7\n\t" \ |
michael@0 | 54 | "pmulhw %%xmm6,%%xmm0\n\t" \ |
michael@0 | 55 | "pmulhw %%xmm2,%%xmm7\n\t" \ |
michael@0 | 56 | "pmulhw %%xmm6,%%xmm4\n\t" \ |
michael@0 | 57 | "paddw %%xmm6,%%xmm0\n\t" \ |
michael@0 | 58 | "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \ |
michael@0 | 59 | "paddw %%xmm1,%%xmm2\n\t" \ |
michael@0 | 60 | "psubw %%xmm0,%%xmm7\n\t" \ |
michael@0 | 61 | "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ |
michael@0 | 62 | "paddw %%xmm4,%%xmm2\n\t" \ |
michael@0 | 63 | "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \ |
michael@0 | 64 | "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ |
michael@0 | 65 | /*5-6 rotation by 3pi/16. \ |
michael@0 | 66 | xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \ |
michael@0 | 67 | "movdqa %%xmm4,%%xmm2\n\t" \ |
michael@0 | 68 | "movdqa %%xmm6,%%xmm1\n\t" \ |
michael@0 | 69 | "pmulhw %%xmm3,%%xmm4\n\t" \ |
michael@0 | 70 | "pmulhw %%xmm5,%%xmm1\n\t" \ |
michael@0 | 71 | "pmulhw %%xmm3,%%xmm6\n\t" \ |
michael@0 | 72 | "pmulhw %%xmm5,%%xmm2\n\t" \ |
michael@0 | 73 | "paddw %%xmm3,%%xmm4\n\t" \ |
michael@0 | 74 | "paddw %%xmm5,%%xmm3\n\t" \ |
michael@0 | 75 | "paddw %%xmm6,%%xmm3\n\t" \ |
michael@0 | 76 | "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \ |
michael@0 | 77 | "paddw %%xmm5,%%xmm1\n\t" \ |
michael@0 | 78 | "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \ |
michael@0 | 79 | "paddw %%xmm3,%%xmm2\n\t" \ |
michael@0 | 80 | "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \ |
michael@0 | 81 | "psubw %%xmm4,%%xmm1\n\t" \ |
michael@0 | 82 | "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \ |
michael@0 | 83 | /*4-7 rotation by 7pi/16. \ |
michael@0 | 84 | xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \ |
michael@0 | 85 | "movdqa %%xmm3,%%xmm0\n\t" \ |
michael@0 | 86 | "movdqa %%xmm4,%%xmm7\n\t" \ |
michael@0 | 87 | "pmulhw %%xmm5,%%xmm3\n\t" \ |
michael@0 | 88 | "pmulhw %%xmm5,%%xmm7\n\t" \ |
michael@0 | 89 | "pmulhw %%xmm6,%%xmm4\n\t" \ |
michael@0 | 90 | "pmulhw %%xmm6,%%xmm0\n\t" \ |
michael@0 | 91 | "paddw %%xmm6,%%xmm4\n\t" \ |
michael@0 | 92 | "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \ |
michael@0 | 93 | "paddw %%xmm5,%%xmm7\n\t" \ |
michael@0 | 94 | "psubw %%xmm4,%%xmm3\n\t" \ |
michael@0 | 95 | "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \ |
michael@0 | 96 | "paddw %%xmm7,%%xmm0\n\t" \ |
michael@0 | 97 | "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \ |
michael@0 | 98 | /*0-1 butterfly. \ |
michael@0 | 99 | xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \ |
michael@0 | 100 | "paddw %%xmm7,%%xmm6\n\t" \ |
michael@0 | 101 | "movdqa %%xmm4,%%xmm5\n\t" \ |
michael@0 | 102 | "pmulhw %%xmm6,%%xmm4\n\t" \ |
michael@0 | 103 | "paddw %%xmm7,%%xmm7\n\t" \ |
michael@0 | 104 | "psubw %%xmm6,%%xmm7\n\t" \ |
michael@0 | 105 | "paddw %%xmm6,%%xmm4\n\t" \ |
michael@0 | 106 | /*Stage 2:*/ \ |
michael@0 | 107 | /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \ |
michael@0 | 108 | 7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \ |
michael@0 | 109 | "movdqa %%xmm3,%%xmm6\n\t" \ |
michael@0 | 110 | "paddw %%xmm1,%%xmm3\n\t" \ |
michael@0 | 111 | "psubw %%xmm1,%%xmm6\n\t" \ |
michael@0 | 112 | "movdqa %%xmm5,%%xmm1\n\t" \ |
michael@0 | 113 | "pmulhw %%xmm7,%%xmm5\n\t" \ |
michael@0 | 114 | "paddw %%xmm7,%%xmm5\n\t" \ |
michael@0 | 115 | "movdqa %%xmm0,%%xmm7\n\t" \ |
michael@0 | 116 | "paddw %%xmm2,%%xmm0\n\t" \ |
michael@0 | 117 | "psubw %%xmm2,%%xmm7\n\t" \ |
michael@0 | 118 | "movdqa %%xmm1,%%xmm2\n\t" \ |
michael@0 | 119 | "pmulhw %%xmm6,%%xmm1\n\t" \ |
michael@0 | 120 | "pmulhw %%xmm7,%%xmm2\n\t" \ |
michael@0 | 121 | "paddw %%xmm6,%%xmm1\n\t" \ |
michael@0 | 122 | "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \ |
michael@0 | 123 | "paddw %%xmm7,%%xmm2\n\t" \ |
michael@0 | 124 | "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \ |
michael@0 | 125 | /*Stage 3: \ |
michael@0 | 126 | 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \ |
michael@0 | 127 | 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \ |
michael@0 | 128 | 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \ |
michael@0 | 129 | "paddw %%xmm2,%%xmm1\n\t" \ |
michael@0 | 130 | "paddw %%xmm5,%%xmm6\n\t" \ |
michael@0 | 131 | "paddw %%xmm4,%%xmm7\n\t" \ |
michael@0 | 132 | "paddw %%xmm2,%%xmm2\n\t" \ |
michael@0 | 133 | "paddw %%xmm4,%%xmm4\n\t" \ |
michael@0 | 134 | "paddw %%xmm5,%%xmm5\n\t" \ |
michael@0 | 135 | "psubw %%xmm1,%%xmm2\n\t" \ |
michael@0 | 136 | "psubw %%xmm7,%%xmm4\n\t" \ |
michael@0 | 137 | "psubw %%xmm6,%%xmm5\n\t" \ |
michael@0 | 138 | |
michael@0 | 139 | /*Performs the last stage of the iDCT, leaving the result in registers. |
michael@0 | 140 | On input, xmm7 down to xmm4 hold t[0] through t[3], and xmm0 up to xmm3 |
michael@0 | 141 | hold t[7] down to t[4], as listed in the Stage 4 comment below. |
michael@0 | 142 | On output, xmm0 through xmm7 contain output rows 0 through 7, in that order.*/ |
michael@0 | 143 | #define OC_IDCT_8x8_D \ |
michael@0 | 144 | "#OC_IDCT_8x8_D\n\t" \ |
michael@0 | 145 | /*Stage 4: \ |
michael@0 | 146 | 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \ |
michael@0 | 147 | 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \ |
michael@0 | 148 | 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \ |
michael@0 | 149 | 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \ |
michael@0 | 150 | "psubw %%xmm0,%%xmm7\n\t" \ |
michael@0 | 151 | "psubw %%xmm1,%%xmm6\n\t" \ |
michael@0 | 152 | "psubw %%xmm2,%%xmm5\n\t" \ |
michael@0 | 153 | "psubw %%xmm3,%%xmm4\n\t" \ |
michael@0 | 154 | "paddw %%xmm0,%%xmm0\n\t" \ |
michael@0 | 155 | "paddw %%xmm1,%%xmm1\n\t" \ |
michael@0 | 156 | "paddw %%xmm2,%%xmm2\n\t" \ |
michael@0 | 157 | "paddw %%xmm3,%%xmm3\n\t" \ |
michael@0 | 158 | "paddw %%xmm7,%%xmm0\n\t" \ |
michael@0 | 159 | "paddw %%xmm6,%%xmm1\n\t" \ |
michael@0 | 160 | "paddw %%xmm5,%%xmm2\n\t" \ |
michael@0 | 161 | "paddw %%xmm4,%%xmm3\n\t" \ |
michael@0 | 162 | |
michael@0 | 163 | /*Performs the last stage of the iDCT, then rounds and stores the result. |
michael@0 | 164 | On input, xmm7 down to xmm4 hold t[0] through t[3], and xmm0 up to xmm3 |
michael@0 | 165 | hold t[7] down to t[4], as listed in the Stage 4 comment below. |
michael@0 | 166 | On output, each row k, computed as (row+8)>>4, is in xmmk and has been stored at offset 0x10*k of the memory operand y; offset 0x40 of y is also used as scratch space along the way, and the bias 8 is read from offset 0x00 of the operand c.*/ |
michael@0 | 167 | #define OC_IDCT_8x8_D_STORE \ |
michael@0 | 168 | "#OC_IDCT_8x8_D_STORE\n\t" \ |
michael@0 | 169 | /*Stage 4: \ |
michael@0 | 170 | 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \ |
michael@0 | 171 | 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \ |
michael@0 | 172 | 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \ |
michael@0 | 173 | 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \ |
michael@0 | 174 | "psubw %%xmm3,%%xmm4\n\t" \ |
michael@0 | 175 | "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \ |
michael@0 | 176 | "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \ |
michael@0 | 177 | "psubw %%xmm0,%%xmm7\n\t" \ |
michael@0 | 178 | "psubw %%xmm1,%%xmm6\n\t" \ |
michael@0 | 179 | "psubw %%xmm2,%%xmm5\n\t" \ |
michael@0 | 180 | "paddw %%xmm4,%%xmm7\n\t" \ |
michael@0 | 181 | "paddw %%xmm4,%%xmm6\n\t" \ |
michael@0 | 182 | "paddw %%xmm4,%%xmm5\n\t" \ |
michael@0 | 183 | "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \ |
michael@0 | 184 | "paddw %%xmm0,%%xmm0\n\t" \ |
michael@0 | 185 | "paddw %%xmm1,%%xmm1\n\t" \ |
michael@0 | 186 | "paddw %%xmm2,%%xmm2\n\t" \ |
michael@0 | 187 | "paddw %%xmm3,%%xmm3\n\t" \ |
michael@0 | 188 | "paddw %%xmm7,%%xmm0\n\t" \ |
michael@0 | 189 | "paddw %%xmm6,%%xmm1\n\t" \ |
michael@0 | 190 | "psraw $4,%%xmm0\n\t" \ |
michael@0 | 191 | "paddw %%xmm5,%%xmm2\n\t" \ |
michael@0 | 192 | "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \ |
michael@0 | 193 | "psraw $4,%%xmm1\n\t" \ |
michael@0 | 194 | "paddw %%xmm4,%%xmm3\n\t" \ |
michael@0 | 195 | "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \ |
michael@0 | 196 | "psraw $4,%%xmm2\n\t" \ |
michael@0 | 197 | "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \ |
michael@0 | 198 | "psraw $4,%%xmm3\n\t" \ |
michael@0 | 199 | "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \ |
michael@0 | 200 | "psraw $4,%%xmm4\n\t" \ |
michael@0 | 201 | "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \ |
michael@0 | 202 | "psraw $4,%%xmm5\n\t" \ |
michael@0 | 203 | "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \ |
michael@0 | 204 | "psraw $4,%%xmm6\n\t" \ |
michael@0 | 205 | "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \ |
michael@0 | 206 | "psraw $4,%%xmm7\n\t" \ |
michael@0 | 207 | "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \ |
michael@0 | 208 | |
michael@0 | 209 | static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
michael@0 | 210 | OC_ALIGN16(ogg_int16_t buf[16]); |
michael@0 | 211 | /*Full 8x8 iDCT: reads all 64 coefficients from _x and writes the result to _y, running the stage macros once on the input as given, transposing in registers, and running them a second time. This routine accepts an 8x8 matrix pre-transposed.*/ |
michael@0 | 212 | __asm__ __volatile__( |
michael@0 | 213 | /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/ |
michael@0 | 214 | "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t" |
michael@0 | 215 | "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t" |
michael@0 | 216 | "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t" |
michael@0 | 217 | "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t" |
michael@0 | 218 | OC_IDCT_8x8_ABC(x) |
michael@0 | 219 | OC_IDCT_8x8_D |
michael@0 | 220 | OC_TRANSPOSE_8x8 |
michael@0 | 221 | /*Store rows 0, 1, 4, and 7 to _y, where OC_IDCT_8x8_ABC(y) reloads them from memory; rows 2, 3, 5, and 6 stay in xmm2, xmm3, xmm5, and xmm6 for the second pass.*/ |
michael@0 | 222 | "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" |
michael@0 | 223 | "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" |
michael@0 | 224 | "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" |
michael@0 | 225 | "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" |
michael@0 | 226 | OC_IDCT_8x8_ABC(y) |
michael@0 | 227 | OC_IDCT_8x8_D_STORE |
michael@0 | 228 | :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)), |
michael@0 | 229 | [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) |
michael@0 | 230 | :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)), |
michael@0 | 231 | [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64)) |
michael@0 | 232 | ); |
michael@0 | 233 | if(_x!=_y){ |
michael@0 | 234 | int i; |
michael@0 | 235 | /*Clear input data for next block (decoder only). xmm0 is zeroed inside the same asm statement as the stores that use it, since a register value is not guaranteed to survive from one asm statement to the next; each iteration clears four 16-byte rows (32 coefficients).*/ |
michael@0 | 236 | for(i=0;i<2;i++){ |
michael@0 | 237 | __asm__ __volatile__( |
michael@0 | 238 | "pxor %%xmm0,%%xmm0\n\t" |
michael@0 | 239 | "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t" |
michael@0 | 240 | "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t" |
michael@0 | 241 | "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t" |
michael@0 | 242 | "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t" |
michael@0 | 243 | :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32)) |
michael@0 | 244 | ); |
michael@0 | 245 | } |
michael@0 | 246 | } |
michael@0 | 247 | } |
michael@0 | 248 | |
michael@0 | 249 | /*For the first step of the 10-coefficient version of the 8x8 iDCT, we only |
michael@0 | 250 | need to work with four columns at a time. |
michael@0 | 251 | Doing this in MMX is faster on processors with a 64-bit data path. On input, mm0 through mm3 must hold X0 through X3 (X4 through X7 are taken to be zero), and the enclosing asm statement must supply the operands c and buf; on output, per the Stage 4 comment, mm0 through mm3 hold the butterfly sums and mm7 down to mm4 the matching differences.*/ |
michael@0 | 252 | #define OC_IDCT_8x8_10_MMX \ |
michael@0 | 253 | "#OC_IDCT_8x8_10_MMX\n\t" \ |
michael@0 | 254 | /*Stage 1:*/ \ |
michael@0 | 255 | /*2-3 rotation by 6pi/16. \ |
michael@0 | 256 | mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \ |
michael@0 | 257 | "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \ |
michael@0 | 258 | "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \ |
michael@0 | 259 | "pmulhw %%mm2,%%mm6\n\t" \ |
michael@0 | 260 | "pmulhw %%mm2,%%mm7\n\t" \ |
michael@0 | 261 | "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \ |
michael@0 | 262 | "paddw %%mm6,%%mm2\n\t" \ |
michael@0 | 263 | "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ |
michael@0 | 264 | "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \ |
michael@0 | 265 | "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ |
michael@0 | 266 | /*5-6 rotation by 3pi/16. \ |
michael@0 | 267 | mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \ |
michael@0 | 268 | "pmulhw %%mm3,%%mm5\n\t" \ |
michael@0 | 269 | "pmulhw %%mm3,%%mm2\n\t" \ |
michael@0 | 270 | "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \ |
michael@0 | 271 | "paddw %%mm3,%%mm5\n\t" \ |
michael@0 | 272 | "paddw %%mm3,%%mm2\n\t" \ |
michael@0 | 273 | "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \ |
michael@0 | 274 | /*4-7 rotation by 7pi/16. \ |
michael@0 | 275 | mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \ |
michael@0 | 276 | "pmulhw %%mm1,%%mm3\n\t" \ |
michael@0 | 277 | "pmulhw %%mm1,%%mm7\n\t" \ |
michael@0 | 278 | "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ |
michael@0 | 279 | "movq %%mm3,%%mm6\n\t" \ |
michael@0 | 280 | "paddw %%mm1,%%mm7\n\t" \ |
michael@0 | 281 | /*0-1 butterfly. \ |
michael@0 | 282 | mm4=C4, mm0=X0, X4=0.*/ \ |
michael@0 | 283 | /*Stage 2:*/ \ |
michael@0 | 284 | /*4-5 butterfly: mm3=t[4], mm5=t[5] \ |
michael@0 | 285 | 7-6 butterfly: mm2=t[6], mm7=t[7]*/ \ |
michael@0 | 286 | "psubw %%mm5,%%mm3\n\t" \ |
michael@0 | 287 | "paddw %%mm5,%%mm6\n\t" \ |
michael@0 | 288 | "movq %%mm4,%%mm1\n\t" \ |
michael@0 | 289 | "pmulhw %%mm0,%%mm4\n\t" \ |
michael@0 | 290 | "paddw %%mm0,%%mm4\n\t" \ |
michael@0 | 291 | "movq %%mm7,%%mm0\n\t" \ |
michael@0 | 292 | "movq %%mm4,%%mm5\n\t" \ |
michael@0 | 293 | "paddw %%mm2,%%mm0\n\t" \ |
michael@0 | 294 | "psubw %%mm2,%%mm7\n\t" \ |
michael@0 | 295 | "movq %%mm1,%%mm2\n\t" \ |
michael@0 | 296 | "pmulhw %%mm6,%%mm1\n\t" \ |
michael@0 | 297 | "pmulhw %%mm7,%%mm2\n\t" \ |
michael@0 | 298 | "paddw %%mm6,%%mm1\n\t" \ |
michael@0 | 299 | "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \ |
michael@0 | 300 | "paddw %%mm7,%%mm2\n\t" \ |
michael@0 | 301 | "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \ |
michael@0 | 302 | /*Stage 3: \ |
michael@0 | 303 | 6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \ |
michael@0 | 304 | 0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \ |
michael@0 | 305 | 1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \ |
michael@0 | 306 | "paddw %%mm2,%%mm1\n\t" \ |
michael@0 | 307 | "paddw %%mm5,%%mm6\n\t" \ |
michael@0 | 308 | "paddw %%mm4,%%mm7\n\t" \ |
michael@0 | 309 | "paddw %%mm2,%%mm2\n\t" \ |
michael@0 | 310 | "paddw %%mm4,%%mm4\n\t" \ |
michael@0 | 311 | "paddw %%mm5,%%mm5\n\t" \ |
michael@0 | 312 | "psubw %%mm1,%%mm2\n\t" \ |
michael@0 | 313 | "psubw %%mm7,%%mm4\n\t" \ |
michael@0 | 314 | "psubw %%mm6,%%mm5\n\t" \ |
michael@0 | 315 | /*Stage 4: \ |
michael@0 | 316 | 0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \ |
michael@0 | 317 | 1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \ |
michael@0 | 318 | 2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \ |
michael@0 | 319 | 3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \ |
michael@0 | 320 | "psubw %%mm0,%%mm7\n\t" \ |
michael@0 | 321 | "psubw %%mm1,%%mm6\n\t" \ |
michael@0 | 322 | "psubw %%mm2,%%mm5\n\t" \ |
michael@0 | 323 | "psubw %%mm3,%%mm4\n\t" \ |
michael@0 | 324 | "paddw %%mm0,%%mm0\n\t" \ |
michael@0 | 325 | "paddw %%mm1,%%mm1\n\t" \ |
michael@0 | 326 | "paddw %%mm2,%%mm2\n\t" \ |
michael@0 | 327 | "paddw %%mm3,%%mm3\n\t" \ |
michael@0 | 328 | "paddw %%mm7,%%mm0\n\t" \ |
michael@0 | 329 | "paddw %%mm6,%%mm1\n\t" \ |
michael@0 | 330 | "paddw %%mm5,%%mm2\n\t" \ |
michael@0 | 331 | "paddw %%mm4,%%mm3\n\t" \ |
michael@0 | 332 | |
michael@0 | 333 | #define OC_IDCT_8x8_10_ABC \ |
michael@0 | 334 | "#OC_IDCT_8x8_10_ABC\n\t" \ |
michael@0 | 335 | /*SSE2 form of the first three iDCT stages for the 10-coefficient case: xmm0 through xmm3 must hold X0 through X3, X4 through X7 are taken to be zero, and the enclosing asm statement must supply the operands c and buf. Stage 1:*/ \ |
michael@0 | 336 | /*2-3 rotation by 6pi/16. \ |
michael@0 | 337 | xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \ |
michael@0 | 338 | "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \ |
michael@0 | 339 | "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \ |
michael@0 | 340 | "pmulhw %%xmm2,%%xmm6\n\t" \ |
michael@0 | 341 | "pmulhw %%xmm2,%%xmm7\n\t" \ |
michael@0 | 342 | "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \ |
michael@0 | 343 | "paddw %%xmm6,%%xmm2\n\t" \ |
michael@0 | 344 | "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ |
michael@0 | 345 | "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \ |
michael@0 | 346 | "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ |
michael@0 | 347 | /*5-6 rotation by 3pi/16. \ |
michael@0 | 348 | xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \ |
michael@0 | 349 | "pmulhw %%xmm3,%%xmm5\n\t" \ |
michael@0 | 350 | "pmulhw %%xmm3,%%xmm2\n\t" \ |
michael@0 | 351 | "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \ |
michael@0 | 352 | "paddw %%xmm3,%%xmm5\n\t" \ |
michael@0 | 353 | "paddw %%xmm3,%%xmm2\n\t" \ |
michael@0 | 354 | "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \ |
michael@0 | 355 | /*4-7 rotation by 7pi/16. \ |
michael@0 | 356 | xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \ |
michael@0 | 357 | "pmulhw %%xmm1,%%xmm3\n\t" \ |
michael@0 | 358 | "pmulhw %%xmm1,%%xmm7\n\t" \ |
michael@0 | 359 | "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \ |
michael@0 | 360 | "movdqa %%xmm3,%%xmm6\n\t" \ |
michael@0 | 361 | "paddw %%xmm1,%%xmm7\n\t" \ |
michael@0 | 362 | /*0-1 butterfly. \ |
michael@0 | 363 | xmm4=C4, xmm0=X0, X4=0.*/ \ |
michael@0 | 364 | /*Stage 2:*/ \ |
michael@0 | 365 | /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \ |
michael@0 | 366 | 7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \ |
michael@0 | 367 | "psubw %%xmm5,%%xmm3\n\t" \ |
michael@0 | 368 | "paddw %%xmm5,%%xmm6\n\t" \ |
michael@0 | 369 | "movdqa %%xmm4,%%xmm1\n\t" \ |
michael@0 | 370 | "pmulhw %%xmm0,%%xmm4\n\t" \ |
michael@0 | 371 | "paddw %%xmm0,%%xmm4\n\t" \ |
michael@0 | 372 | "movdqa %%xmm7,%%xmm0\n\t" \ |
michael@0 | 373 | "movdqa %%xmm4,%%xmm5\n\t" \ |
michael@0 | 374 | "paddw %%xmm2,%%xmm0\n\t" \ |
michael@0 | 375 | "psubw %%xmm2,%%xmm7\n\t" \ |
michael@0 | 376 | "movdqa %%xmm1,%%xmm2\n\t" \ |
michael@0 | 377 | "pmulhw %%xmm6,%%xmm1\n\t" \ |
michael@0 | 378 | "pmulhw %%xmm7,%%xmm2\n\t" \ |
michael@0 | 379 | "paddw %%xmm6,%%xmm1\n\t" \ |
michael@0 | 380 | "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \ |
michael@0 | 381 | "paddw %%xmm7,%%xmm2\n\t" \ |
michael@0 | 382 | "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \ |
michael@0 | 383 | /*Stage 3: \ |
michael@0 | 384 | 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \ |
michael@0 | 385 | 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \ |
michael@0 | 386 | 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \ |
michael@0 | 387 | "paddw %%xmm2,%%xmm1\n\t" \ |
michael@0 | 388 | "paddw %%xmm5,%%xmm6\n\t" \ |
michael@0 | 389 | "paddw %%xmm4,%%xmm7\n\t" \ |
michael@0 | 390 | "paddw %%xmm2,%%xmm2\n\t" \ |
michael@0 | 391 | "paddw %%xmm4,%%xmm4\n\t" \ |
michael@0 | 392 | "paddw %%xmm5,%%xmm5\n\t" \ |
michael@0 | 393 | "psubw %%xmm1,%%xmm2\n\t" \ |
michael@0 | 394 | "psubw %%xmm7,%%xmm4\n\t" \ |
michael@0 | 395 | "psubw %%xmm6,%%xmm5\n\t" \ |
michael@0 | 396 | |
michael@0 | 397 | static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
michael@0 | 398 | OC_ALIGN16(ogg_int16_t buf[16]); |
michael@0 | 399 | /*Reduced 8x8 iDCT, selected by oc_idct8x8_sse2() when _last_zzi<=10: only the first four coefficients of rows 0 through 3 of _x are read, and the result is written to _y. This routine accepts an 8x8 matrix pre-transposed. NOTE(review): MMX registers are used and no emms is issued in this function; presumably the caller restores the FPU state - confirm.*/ |
michael@0 | 400 | __asm__ __volatile__( |
michael@0 | 401 | "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t" |
michael@0 | 402 | "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t" |
michael@0 | 403 | "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t" |
michael@0 | 404 | "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t" |
michael@0 | 405 | OC_IDCT_8x8_10_MMX |
michael@0 | 406 | OC_TRANSPOSE_8x4_MMX2SSE |
michael@0 | 407 | OC_IDCT_8x8_10_ABC |
michael@0 | 408 | OC_IDCT_8x8_D_STORE |
michael@0 | 409 | :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)), |
michael@0 | 410 | [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) |
michael@0 | 411 | :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)), |
michael@0 | 412 | [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,64)) |
michael@0 | 413 | ); |
michael@0 | 414 | if(_x!=_y){ |
michael@0 | 415 | /*Clear input data for next block (decoder only). Only the first four coefficients of rows 0 through 3 are zeroed (the ones read above); the operand is read-write because the stores do not cover the whole 28-element span.*/ |
michael@0 | 416 | __asm__ __volatile__( |
michael@0 | 417 | "pxor %%mm0,%%mm0\n\t" |
michael@0 | 418 | "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" |
michael@0 | 419 | "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" |
michael@0 | 420 | "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t" |
michael@0 | 421 | "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t" |
michael@0 | 422 | :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28)) |
michael@0 | 423 | ); |
michael@0 | 424 | } |
michael@0 | 425 | } |
michael@0 | 426 | |
michael@0 | 427 | /*Inverse 8x8 Type-II DCT entry point for the SSE2 code path. |
michael@0 | 428 | The input is expected to carry a scale factor of 4 relative to the |
michael@0 | 429 | orthonormal form of the transform.*/ |
michael@0 | 430 | void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ |
michael@0 | 431 | /*_last_zzi is not simply the number of coefficients decoded for this |
michael@0 | 432 | block. |
michael@0 | 433 | It holds the value zzi had just BEFORE the block's final token was |
michael@0 | 434 | decoded. |
michael@0 | 435 | That final token is usually an EOB token (continuing an EOB run from an |
michael@0 | 436 | earlier block counts as well), in which case the two values agree. |
michael@0 | 437 | They differ when the final token was NOT an EOB token, yet brought the |
michael@0 | 438 | block to exactly 64 coefficients: _last_zzi is then smaller than 64. |
michael@0 | 439 | As long as that token was not a pure zero run, _last_zzi is at least 46, |
michael@0 | 440 | which makes no difference to the cases handled in this routine. |
michael@0 | 441 | If it WAS a pure zero run of length 63, though, _last_zzi is 1 while 64 |
michael@0 | 442 | coefficients were actually decoded. |
michael@0 | 443 | The reduced transform below is then chosen, where a true coefficient |
michael@0 | 444 | count would have chosen the full one. |
michael@0 | 445 | Similarly, a zero run of length 64 leaves _last_zzi at 0, but the DC |
michael@0 | 446 | coefficient still gets processed, since DC prediction may have made it |
michael@0 | 447 | non-zero. |
michael@0 | 448 | Convoluted as this is, it is arguably the right behavior: a block ending |
michael@0 | 449 | in a long zero run rather than a normal EOB token still gets the smaller |
michael@0 | 450 | transform. |
michael@0 | 451 | It could be smarter... several separate zero runs at the end of a block |
michael@0 | 452 | will fool it, but an encoder that produces those really deserves what it |
michael@0 | 453 | gets. |
michael@0 | 454 | Needless to say, this approach was inherited from VP3.*/ |
michael@0 | 455 | /*Pick the transform: the full version above 10, the reduced one otherwise.*/ |
michael@0 | 456 | if(_last_zzi>10)oc_idct8x8_slow_sse2(_y,_x); |
michael@0 | 457 | else oc_idct8x8_10_sse2(_y,_x); |
michael@0 | 458 | } |
michael@0 | 459 | |
michael@0 | 460 | #endif |