1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libtheora/lib/x86/sse2idct.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,460 @@ 1.4 +/******************************************************************** 1.5 + * * 1.6 + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * 1.7 + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * 1.8 + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * 1.9 + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * 1.10 + * * 1.11 + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * 1.12 + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * 1.13 + * * 1.14 + ******************************************************************** 1.15 + 1.16 + function: 1.17 + last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $ 1.18 + 1.19 + ********************************************************************/ 1.20 + 1.21 +/*SSE2 acceleration of Theora's iDCT.*/ 1.22 +#include "x86int.h" 1.23 +#include "sse2trans.h" 1.24 +#include "../dct.h" 1.25 + 1.26 +#if defined(OC_X86_ASM) 1.27 + 1.28 +/*A table of constants used by the MMX routines.*/ 1.29 +const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={ 1.30 + 8, 8, 8, 8, 8, 8, 8, 8, 1.31 + OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7, 1.32 + OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6, 1.33 + OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5, 1.34 + OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4, 1.35 + OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3, 1.36 + OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2, 1.37 + OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1 1.38 +}; 1.39 + 1.40 + 1.41 +/*Performs the first three stages of the iDCT. 1.42 + xmm2, xmm6, xmm3, and xmm5 must contain the corresponding rows of the input 1.43 + (accessed in that order). 1.44 + The remaining rows must be in _x at their corresponding locations. 1.45 + On output, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3 1.46 + contain rows 4 through 7.*/ 1.47 +#define OC_IDCT_8x8_ABC(_x) \ 1.48 + "#OC_IDCT_8x8_ABC\n\t" \ 1.49 + /*Stage 1:*/ \ 1.50 + /*2-3 rotation by 6pi/16. \ 1.51 + xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \ 1.52 + "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \ 1.53 + "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \ 1.54 + "movdqa %%xmm1,%%xmm0\n\t" \ 1.55 + "pmulhw %%xmm2,%%xmm1\n\t" \ 1.56 + "movdqa %%xmm4,%%xmm7\n\t" \ 1.57 + "pmulhw %%xmm6,%%xmm0\n\t" \ 1.58 + "pmulhw %%xmm2,%%xmm7\n\t" \ 1.59 + "pmulhw %%xmm6,%%xmm4\n\t" \ 1.60 + "paddw %%xmm6,%%xmm0\n\t" \ 1.61 + "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \ 1.62 + "paddw %%xmm1,%%xmm2\n\t" \ 1.63 + "psubw %%xmm0,%%xmm7\n\t" \ 1.64 + "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ 1.65 + "paddw %%xmm4,%%xmm2\n\t" \ 1.66 + "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \ 1.67 + "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ 1.68 + /*5-6 rotation by 3pi/16. \ 1.69 + xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \ 1.70 + "movdqa %%xmm4,%%xmm2\n\t" \ 1.71 + "movdqa %%xmm6,%%xmm1\n\t" \ 1.72 + "pmulhw %%xmm3,%%xmm4\n\t" \ 1.73 + "pmulhw %%xmm5,%%xmm1\n\t" \ 1.74 + "pmulhw %%xmm3,%%xmm6\n\t" \ 1.75 + "pmulhw %%xmm5,%%xmm2\n\t" \ 1.76 + "paddw %%xmm3,%%xmm4\n\t" \ 1.77 + "paddw %%xmm5,%%xmm3\n\t" \ 1.78 + "paddw %%xmm6,%%xmm3\n\t" \ 1.79 + "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \ 1.80 + "paddw %%xmm5,%%xmm1\n\t" \ 1.81 + "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \ 1.82 + "paddw %%xmm3,%%xmm2\n\t" \ 1.83 + "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \ 1.84 + "psubw %%xmm4,%%xmm1\n\t" \ 1.85 + "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \ 1.86 + /*4-7 rotation by 7pi/16. \ 1.87 + xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \ 1.88 + "movdqa %%xmm3,%%xmm0\n\t" \ 1.89 + "movdqa %%xmm4,%%xmm7\n\t" \ 1.90 + "pmulhw %%xmm5,%%xmm3\n\t" \ 1.91 + "pmulhw %%xmm5,%%xmm7\n\t" \ 1.92 + "pmulhw %%xmm6,%%xmm4\n\t" \ 1.93 + "pmulhw %%xmm6,%%xmm0\n\t" \ 1.94 + "paddw %%xmm6,%%xmm4\n\t" \ 1.95 + "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \ 1.96 + "paddw %%xmm5,%%xmm7\n\t" \ 1.97 + "psubw %%xmm4,%%xmm3\n\t" \ 1.98 + "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \ 1.99 + "paddw %%xmm7,%%xmm0\n\t" \ 1.100 + "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \ 1.101 + /*0-1 butterfly. \ 1.102 + xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \ 1.103 + "paddw %%xmm7,%%xmm6\n\t" \ 1.104 + "movdqa %%xmm4,%%xmm5\n\t" \ 1.105 + "pmulhw %%xmm6,%%xmm4\n\t" \ 1.106 + "paddw %%xmm7,%%xmm7\n\t" \ 1.107 + "psubw %%xmm6,%%xmm7\n\t" \ 1.108 + "paddw %%xmm6,%%xmm4\n\t" \ 1.109 + /*Stage 2:*/ \ 1.110 + /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \ 1.111 + 7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \ 1.112 + "movdqa %%xmm3,%%xmm6\n\t" \ 1.113 + "paddw %%xmm1,%%xmm3\n\t" \ 1.114 + "psubw %%xmm1,%%xmm6\n\t" \ 1.115 + "movdqa %%xmm5,%%xmm1\n\t" \ 1.116 + "pmulhw %%xmm7,%%xmm5\n\t" \ 1.117 + "paddw %%xmm7,%%xmm5\n\t" \ 1.118 + "movdqa %%xmm0,%%xmm7\n\t" \ 1.119 + "paddw %%xmm2,%%xmm0\n\t" \ 1.120 + "psubw %%xmm2,%%xmm7\n\t" \ 1.121 + "movdqa %%xmm1,%%xmm2\n\t" \ 1.122 + "pmulhw %%xmm6,%%xmm1\n\t" \ 1.123 + "pmulhw %%xmm7,%%xmm2\n\t" \ 1.124 + "paddw %%xmm6,%%xmm1\n\t" \ 1.125 + "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \ 1.126 + "paddw %%xmm7,%%xmm2\n\t" \ 1.127 + "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \ 1.128 + /*Stage 3: \ 1.129 + 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \ 1.130 + 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \ 1.131 + 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \ 1.132 + "paddw %%xmm2,%%xmm1\n\t" \ 1.133 + "paddw %%xmm5,%%xmm6\n\t" \ 1.134 + "paddw %%xmm4,%%xmm7\n\t" \ 1.135 + "paddw %%xmm2,%%xmm2\n\t" \ 1.136 + "paddw %%xmm4,%%xmm4\n\t" \ 1.137 + "paddw %%xmm5,%%xmm5\n\t" \ 1.138 + "psubw %%xmm1,%%xmm2\n\t" \ 1.139 + "psubw %%xmm7,%%xmm4\n\t" \ 1.140 + "psubw %%xmm6,%%xmm5\n\t" \ 1.141 + 1.142 +/*Performs the last stage of the iDCT. 1.143 + On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3 1.144 + contain rows 4 through 7. 1.145 + On output, xmm0 through xmm7 contain the corresponding rows.*/ 1.146 +#define OC_IDCT_8x8_D \ 1.147 + "#OC_IDCT_8x8_D\n\t" \ 1.148 + /*Stage 4: \ 1.149 + 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \ 1.150 + 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \ 1.151 + 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \ 1.152 + 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \ 1.153 + "psubw %%xmm0,%%xmm7\n\t" \ 1.154 + "psubw %%xmm1,%%xmm6\n\t" \ 1.155 + "psubw %%xmm2,%%xmm5\n\t" \ 1.156 + "psubw %%xmm3,%%xmm4\n\t" \ 1.157 + "paddw %%xmm0,%%xmm0\n\t" \ 1.158 + "paddw %%xmm1,%%xmm1\n\t" \ 1.159 + "paddw %%xmm2,%%xmm2\n\t" \ 1.160 + "paddw %%xmm3,%%xmm3\n\t" \ 1.161 + "paddw %%xmm7,%%xmm0\n\t" \ 1.162 + "paddw %%xmm6,%%xmm1\n\t" \ 1.163 + "paddw %%xmm5,%%xmm2\n\t" \ 1.164 + "paddw %%xmm4,%%xmm3\n\t" \ 1.165 + 1.166 +/*Performs the last stage of the iDCT. 1.167 + On input, xmm7 down to xmm4 contain rows 0 through 3, and xmm0 up to xmm3 1.168 + contain rows 4 through 7. 1.169 + On output, xmm0 through xmm7 contain the corresponding rows.*/ 1.170 +#define OC_IDCT_8x8_D_STORE \ 1.171 + "#OC_IDCT_8x8_D_STORE\n\t" \ 1.172 + /*Stage 4: \ 1.173 + 0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \ 1.174 + 1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \ 1.175 + 2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \ 1.176 + 3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \ 1.177 + "psubw %%xmm3,%%xmm4\n\t" \ 1.178 + "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \ 1.179 + "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \ 1.180 + "psubw %%xmm0,%%xmm7\n\t" \ 1.181 + "psubw %%xmm1,%%xmm6\n\t" \ 1.182 + "psubw %%xmm2,%%xmm5\n\t" \ 1.183 + "paddw %%xmm4,%%xmm7\n\t" \ 1.184 + "paddw %%xmm4,%%xmm6\n\t" \ 1.185 + "paddw %%xmm4,%%xmm5\n\t" \ 1.186 + "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \ 1.187 + "paddw %%xmm0,%%xmm0\n\t" \ 1.188 + "paddw %%xmm1,%%xmm1\n\t" \ 1.189 + "paddw %%xmm2,%%xmm2\n\t" \ 1.190 + "paddw %%xmm3,%%xmm3\n\t" \ 1.191 + "paddw %%xmm7,%%xmm0\n\t" \ 1.192 + "paddw %%xmm6,%%xmm1\n\t" \ 1.193 + "psraw $4,%%xmm0\n\t" \ 1.194 + "paddw %%xmm5,%%xmm2\n\t" \ 1.195 + "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \ 1.196 + "psraw $4,%%xmm1\n\t" \ 1.197 + "paddw %%xmm4,%%xmm3\n\t" \ 1.198 + "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \ 1.199 + "psraw $4,%%xmm2\n\t" \ 1.200 + "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \ 1.201 + "psraw $4,%%xmm3\n\t" \ 1.202 + "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \ 1.203 + "psraw $4,%%xmm4\n\t" \ 1.204 + "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \ 1.205 + "psraw $4,%%xmm5\n\t" \ 1.206 + "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \ 1.207 + "psraw $4,%%xmm6\n\t" \ 1.208 + "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \ 1.209 + "psraw $4,%%xmm7\n\t" \ 1.210 + "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" \ 1.211 + 1.212 +static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ 1.213 + OC_ALIGN16(ogg_int16_t buf[16]); 1.214 + /*This routine accepts an 8x8 matrix pre-transposed.*/ 1.215 + __asm__ __volatile__( 1.216 + /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/ 1.217 + "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t" 1.218 + "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t" 1.219 + "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t" 1.220 + "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t" 1.221 + OC_IDCT_8x8_ABC(x) 1.222 + OC_IDCT_8x8_D 1.223 + OC_TRANSPOSE_8x8 1.224 + /*Clear out rows 0, 1, 4, and 7 for the first stage of the iDCT.*/ 1.225 + "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" 1.226 + "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" 1.227 + "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" 1.228 + "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" 1.229 + OC_IDCT_8x8_ABC(y) 1.230 + OC_IDCT_8x8_D_STORE 1.231 + :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)), 1.232 + [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) 1.233 + :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)), 1.234 + [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)) 1.235 + ); 1.236 + if(_x!=_y){ 1.237 + int i; 1.238 + __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::); 1.239 + /*Clear input data for next block (decoder only).*/ 1.240 + for(i=0;i<2;i++){ 1.241 + __asm__ __volatile__( 1.242 + "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t" 1.243 + "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t" 1.244 + "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t" 1.245 + "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t" 1.246 + :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32)) 1.247 + ); 1.248 + } 1.249 + } 1.250 +} 1.251 + 1.252 +/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only 1.253 + need to work with four columns at a time. 1.254 + Doing this in MMX is faster on processors with a 64-bit data path.*/ 1.255 +#define OC_IDCT_8x8_10_MMX \ 1.256 + "#OC_IDCT_8x8_10_MMX\n\t" \ 1.257 + /*Stage 1:*/ \ 1.258 + /*2-3 rotation by 6pi/16. \ 1.259 + mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \ 1.260 + "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \ 1.261 + "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \ 1.262 + "pmulhw %%mm2,%%mm6\n\t" \ 1.263 + "pmulhw %%mm2,%%mm7\n\t" \ 1.264 + "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \ 1.265 + "paddw %%mm6,%%mm2\n\t" \ 1.266 + "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ 1.267 + "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \ 1.268 + "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ 1.269 + /*5-6 rotation by 3pi/16. \ 1.270 + mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \ 1.271 + "pmulhw %%mm3,%%mm5\n\t" \ 1.272 + "pmulhw %%mm3,%%mm2\n\t" \ 1.273 + "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \ 1.274 + "paddw %%mm3,%%mm5\n\t" \ 1.275 + "paddw %%mm3,%%mm2\n\t" \ 1.276 + "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \ 1.277 + /*4-7 rotation by 7pi/16. \ 1.278 + mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \ 1.279 + "pmulhw %%mm1,%%mm3\n\t" \ 1.280 + "pmulhw %%mm1,%%mm7\n\t" \ 1.281 + "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \ 1.282 + "movq %%mm3,%%mm6\n\t" \ 1.283 + "paddw %%mm1,%%mm7\n\t" \ 1.284 + /*0-1 butterfly. \ 1.285 + mm4=C4, mm0=X0, X4=0.*/ \ 1.286 + /*Stage 2:*/ \ 1.287 + /*4-5 butterfly: mm3=t[4], mm5=t[5] \ 1.288 + 7-6 butterfly: mm2=t[6], mm7=t[7]*/ \ 1.289 + "psubw %%mm5,%%mm3\n\t" \ 1.290 + "paddw %%mm5,%%mm6\n\t" \ 1.291 + "movq %%mm4,%%mm1\n\t" \ 1.292 + "pmulhw %%mm0,%%mm4\n\t" \ 1.293 + "paddw %%mm0,%%mm4\n\t" \ 1.294 + "movq %%mm7,%%mm0\n\t" \ 1.295 + "movq %%mm4,%%mm5\n\t" \ 1.296 + "paddw %%mm2,%%mm0\n\t" \ 1.297 + "psubw %%mm2,%%mm7\n\t" \ 1.298 + "movq %%mm1,%%mm2\n\t" \ 1.299 + "pmulhw %%mm6,%%mm1\n\t" \ 1.300 + "pmulhw %%mm7,%%mm2\n\t" \ 1.301 + "paddw %%mm6,%%mm1\n\t" \ 1.302 + "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \ 1.303 + "paddw %%mm7,%%mm2\n\t" \ 1.304 + "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \ 1.305 + /*Stage 3: \ 1.306 + 6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \ 1.307 + 0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \ 1.308 + 1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \ 1.309 + "paddw %%mm2,%%mm1\n\t" \ 1.310 + "paddw %%mm5,%%mm6\n\t" \ 1.311 + "paddw %%mm4,%%mm7\n\t" \ 1.312 + "paddw %%mm2,%%mm2\n\t" \ 1.313 + "paddw %%mm4,%%mm4\n\t" \ 1.314 + "paddw %%mm5,%%mm5\n\t" \ 1.315 + "psubw %%mm1,%%mm2\n\t" \ 1.316 + "psubw %%mm7,%%mm4\n\t" \ 1.317 + "psubw %%mm6,%%mm5\n\t" \ 1.318 + /*Stage 4: \ 1.319 + 0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \ 1.320 + 1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \ 1.321 + 2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \ 1.322 + 3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \ 1.323 + "psubw %%mm0,%%mm7\n\t" \ 1.324 + "psubw %%mm1,%%mm6\n\t" \ 1.325 + "psubw %%mm2,%%mm5\n\t" \ 1.326 + "psubw %%mm3,%%mm4\n\t" \ 1.327 + "paddw %%mm0,%%mm0\n\t" \ 1.328 + "paddw %%mm1,%%mm1\n\t" \ 1.329 + "paddw %%mm2,%%mm2\n\t" \ 1.330 + "paddw %%mm3,%%mm3\n\t" \ 1.331 + "paddw %%mm7,%%mm0\n\t" \ 1.332 + "paddw %%mm6,%%mm1\n\t" \ 1.333 + "paddw %%mm5,%%mm2\n\t" \ 1.334 + "paddw %%mm4,%%mm3\n\t" \ 1.335 + 1.336 +#define OC_IDCT_8x8_10_ABC \ 1.337 + "#OC_IDCT_8x8_10_ABC\n\t" \ 1.338 + /*Stage 1:*/ \ 1.339 + /*2-3 rotation by 6pi/16. \ 1.340 + xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \ 1.341 + "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \ 1.342 + "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \ 1.343 + "pmulhw %%xmm2,%%xmm6\n\t" \ 1.344 + "pmulhw %%xmm2,%%xmm7\n\t" \ 1.345 + "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \ 1.346 + "paddw %%xmm6,%%xmm2\n\t" \ 1.347 + "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \ 1.348 + "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \ 1.349 + "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \ 1.350 + /*5-6 rotation by 3pi/16. \ 1.351 + xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \ 1.352 + "pmulhw %%xmm3,%%xmm5\n\t" \ 1.353 + "pmulhw %%xmm3,%%xmm2\n\t" \ 1.354 + "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \ 1.355 + "paddw %%xmm3,%%xmm5\n\t" \ 1.356 + "paddw %%xmm3,%%xmm2\n\t" \ 1.357 + "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \ 1.358 + /*4-7 rotation by 7pi/16. \ 1.359 + xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \ 1.360 + "pmulhw %%xmm1,%%xmm3\n\t" \ 1.361 + "pmulhw %%xmm1,%%xmm7\n\t" \ 1.362 + "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \ 1.363 + "movdqa %%xmm3,%%xmm6\n\t" \ 1.364 + "paddw %%xmm1,%%xmm7\n\t" \ 1.365 + /*0-1 butterfly. \ 1.366 + xmm4=C4, xmm0=X0, X4=0.*/ \ 1.367 + /*Stage 2:*/ \ 1.368 + /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \ 1.369 + 7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \ 1.370 + "psubw %%xmm5,%%xmm3\n\t" \ 1.371 + "paddw %%xmm5,%%xmm6\n\t" \ 1.372 + "movdqa %%xmm4,%%xmm1\n\t" \ 1.373 + "pmulhw %%xmm0,%%xmm4\n\t" \ 1.374 + "paddw %%xmm0,%%xmm4\n\t" \ 1.375 + "movdqa %%xmm7,%%xmm0\n\t" \ 1.376 + "movdqa %%xmm4,%%xmm5\n\t" \ 1.377 + "paddw %%xmm2,%%xmm0\n\t" \ 1.378 + "psubw %%xmm2,%%xmm7\n\t" \ 1.379 + "movdqa %%xmm1,%%xmm2\n\t" \ 1.380 + "pmulhw %%xmm6,%%xmm1\n\t" \ 1.381 + "pmulhw %%xmm7,%%xmm2\n\t" \ 1.382 + "paddw %%xmm6,%%xmm1\n\t" \ 1.383 + "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \ 1.384 + "paddw %%xmm7,%%xmm2\n\t" \ 1.385 + "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \ 1.386 + /*Stage 3: \ 1.387 + 6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \ 1.388 + 0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \ 1.389 + 1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \ 1.390 + "paddw %%xmm2,%%xmm1\n\t" \ 1.391 + "paddw %%xmm5,%%xmm6\n\t" \ 1.392 + "paddw %%xmm4,%%xmm7\n\t" \ 1.393 + "paddw %%xmm2,%%xmm2\n\t" \ 1.394 + "paddw %%xmm4,%%xmm4\n\t" \ 1.395 + "paddw %%xmm5,%%xmm5\n\t" \ 1.396 + "psubw %%xmm1,%%xmm2\n\t" \ 1.397 + "psubw %%xmm7,%%xmm4\n\t" \ 1.398 + "psubw %%xmm6,%%xmm5\n\t" \ 1.399 + 1.400 +static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ 1.401 + OC_ALIGN16(ogg_int16_t buf[16]); 1.402 + /*This routine accepts an 8x8 matrix pre-transposed.*/ 1.403 + __asm__ __volatile__( 1.404 + "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t" 1.405 + "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t" 1.406 + "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t" 1.407 + "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t" 1.408 + OC_IDCT_8x8_10_MMX 1.409 + OC_TRANSPOSE_8x4_MMX2SSE 1.410 + OC_IDCT_8x8_10_ABC 1.411 + OC_IDCT_8x8_D_STORE 1.412 + :[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16)), 1.413 + [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) 1.414 + :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), 1.415 + [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)) 1.416 + ); 1.417 + if(_x!=_y){ 1.418 + /*Clear input data for next block (decoder only).*/ 1.419 + __asm__ __volatile__( 1.420 + "pxor %%mm0,%%mm0\n\t" 1.421 + "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" 1.422 + "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" 1.423 + "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t" 1.424 + "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t" 1.425 + :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28)) 1.426 + ); 1.427 + } 1.428 +} 1.429 + 1.430 +/*Performs an inverse 8x8 Type-II DCT transform. 1.431 + The input is assumed to be scaled by a factor of 4 relative to orthonormal 1.432 + version of the transform.*/ 1.433 +void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ 1.434 + /*_last_zzi is subtly different from an actual count of the number of 1.435 + coefficients we decoded for this block. 1.436 + It contains the value of zzi BEFORE the final token in the block was 1.437 + decoded. 1.438 + In most cases this is an EOB token (the continuation of an EOB run from a 1.439 + previous block counts), and so this is the same as the coefficient count. 1.440 + However, in the case that the last token was NOT an EOB token, but filled 1.441 + the block up with exactly 64 coefficients, _last_zzi will be less than 64. 1.442 + Provided the last token was not a pure zero run, the minimum value it can 1.443 + be is 46, and so that doesn't affect any of the cases in this routine. 1.444 + However, if the last token WAS a pure zero run of length 63, then _last_zzi 1.445 + will be 1 while the number of coefficients decoded is 64. 1.446 + Thus, we will trigger the following special case, where the real 1.447 + coefficient count would not. 1.448 + Note also that a zero run of length 64 will give _last_zzi a value of 0, 1.449 + but we still process the DC coefficient, which might have a non-zero value 1.450 + due to DC prediction. 1.451 + Although convoluted, this is arguably the correct behavior: it allows us to 1.452 + use a smaller transform when the block ends with a long zero run instead 1.453 + of a normal EOB token. 1.454 + It could be smarter... multiple separate zero runs at the end of a block 1.455 + will fool it, but an encoder that generates these really deserves what it 1.456 + gets. 1.457 + Needless to say we inherited this approach from VP3.*/ 1.458 + /*Then perform the iDCT.*/ 1.459 + if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x); 1.460 + else oc_idct8x8_slow_sse2(_y,_x); 1.461 +} 1.462 + 1.463 +#endif