1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libtheora/lib/x86/mmxidct.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,562 @@ 1.4 +/******************************************************************** 1.5 + * * 1.6 + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * 1.7 + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * 1.8 + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * 1.9 + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * 1.10 + * * 1.11 + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * 1.12 + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * 1.13 + * * 1.14 + ******************************************************************** 1.15 + 1.16 + function: 1.17 + last mod: $Id: mmxidct.c 17446 2010-09-23 20:06:20Z tterribe $ 1.18 + 1.19 + ********************************************************************/ 1.20 + 1.21 +/*MMX acceleration of Theora's iDCT. 1.22 + Originally written by Rudolf Marek, based on code from On2's VP3.*/ 1.23 +#include "x86int.h" 1.24 +#include "../dct.h" 1.25 + 1.26 +#if defined(OC_X86_ASM) 1.27 + 1.28 +/*These are offsets into the table of constants below.*/ 1.29 +/*7 rows of cosines, in order: pi/16 * (1 ... 
7).*/
1.30 +#define OC_COSINE_OFFSET (0)
1.31 +/*A row of 8's. NOTE(review): neither offset macro is referenced in the code shown here; the asm below uses literal byte offsets through OC_MEM_OFFS(...,c) instead -- confirm these two values are still meaningful.*/
1.32 +#define OC_EIGHT_OFFSET (56)
1.33 +
1.34 +
1.35 +
1.36 +/*38 cycles. Opening stage expanded by both OC_ROW_IDCT and OC_COLUMN_IDCT. Reads OC_I(0..3,_x) and OC_J(4..7,_x) plus the constant rows at byte offsets 0x10..0x70 of c; parks two intermediates in OC_I(1,_y) and OC_I(2,_y) and reloads OC_I(1,_y) into mm0 near the end.*/
1.37 +#define OC_IDCT_BEGIN(_y,_x) \
1.38 +  "#OC_IDCT_BEGIN\n\t" \
1.39 +  "movq "OC_I(3,_x)",%%mm2\n\t" \
1.40 +  "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
1.41 +  "movq %%mm2,%%mm4\n\t" \
1.42 +  "movq "OC_J(5,_x)",%%mm7\n\t" \
1.43 +  "pmulhw %%mm6,%%mm4\n\t" \
1.44 +  "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
1.45 +  "pmulhw %%mm7,%%mm6\n\t" \
1.46 +  "movq %%mm1,%%mm5\n\t" \
1.47 +  "pmulhw %%mm2,%%mm1\n\t" \
1.48 +  "movq "OC_I(1,_x)",%%mm3\n\t" \
1.49 +  "pmulhw %%mm7,%%mm5\n\t" \
1.50 +  "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
1.51 +  "paddw %%mm2,%%mm4\n\t" \
1.52 +  "paddw %%mm7,%%mm6\n\t" \
1.53 +  "paddw %%mm1,%%mm2\n\t" \
1.54 +  "movq "OC_J(7,_x)",%%mm1\n\t" \
1.55 +  "paddw %%mm5,%%mm7\n\t" \
1.56 +  "movq %%mm0,%%mm5\n\t" \
1.57 +  "pmulhw %%mm3,%%mm0\n\t" \
1.58 +  "paddw %%mm7,%%mm4\n\t" \
1.59 +  "pmulhw %%mm1,%%mm5\n\t" \
1.60 +  "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \
1.61 +  "psubw %%mm2,%%mm6\n\t" \
1.62 +  "paddw %%mm3,%%mm0\n\t" \
1.63 +  "pmulhw %%mm7,%%mm3\n\t" \
1.64 +  "movq "OC_I(2,_x)",%%mm2\n\t" \
1.65 +  "pmulhw %%mm1,%%mm7\n\t" \
1.66 +  "paddw %%mm1,%%mm5\n\t" \
1.67 +  "movq %%mm2,%%mm1\n\t" \
1.68 +  "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \
1.69 +  "psubw %%mm5,%%mm3\n\t" \
1.70 +  "movq "OC_J(6,_x)",%%mm5\n\t" \
1.71 +  "paddw %%mm7,%%mm0\n\t" \
1.72 +  "movq %%mm5,%%mm7\n\t" \
1.73 +  "psubw %%mm4,%%mm0\n\t" \
1.74 +  "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
1.75 +  "paddw %%mm1,%%mm2\n\t" \
1.76 +  "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
1.77 +  "paddw %%mm4,%%mm4\n\t" \
1.78 +  "paddw %%mm0,%%mm4\n\t" \
1.79 +  "psubw %%mm6,%%mm3\n\t" \
1.80 +  "paddw %%mm7,%%mm5\n\t" \
1.81 +  "paddw %%mm6,%%mm6\n\t" \
1.82 +  "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
1.83 +  "paddw %%mm3,%%mm6\n\t" \
1.84 +  "movq %%mm4,"OC_I(1,_y)"\n\t" \
1.85 +  "psubw %%mm5,%%mm1\n\t" \
1.86 +  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
1.87 +  "movq %%mm3,%%mm5\n\t" \
1.88 +  "pmulhw %%mm4,%%mm3\n\t" \
1.89 +  "paddw %%mm2,%%mm7\n\t" \
1.90 +  "movq %%mm6,"OC_I(2,_y)"\n\t" \
1.91 +  "movq %%mm0,%%mm2\n\t" \
1.92 +  "movq "OC_I(0,_x)",%%mm6\n\t" \
1.93 +  "pmulhw %%mm4,%%mm0\n\t" \
1.94 +  "paddw %%mm3,%%mm5\n\t" \
1.95 +  "movq "OC_J(4,_x)",%%mm3\n\t" \
1.96 +  "psubw %%mm1,%%mm5\n\t" \
1.97 +  "paddw %%mm0,%%mm2\n\t" \
1.98 +  "psubw %%mm3,%%mm6\n\t" \
1.99 +  "movq %%mm6,%%mm0\n\t" \
1.100 +  "pmulhw %%mm4,%%mm6\n\t" \
1.101 +  "paddw %%mm3,%%mm3\n\t" \
1.102 +  "paddw %%mm1,%%mm1\n\t" \
1.103 +  "paddw %%mm0,%%mm3\n\t" \
1.104 +  "paddw %%mm5,%%mm1\n\t" \
1.105 +  "pmulhw %%mm3,%%mm4\n\t" \
1.106 +  "paddw %%mm0,%%mm6\n\t" \
1.107 +  "psubw %%mm2,%%mm6\n\t" \
1.108 +  "paddw %%mm2,%%mm2\n\t" \
1.109 +  "movq "OC_I(1,_y)",%%mm0\n\t" \
1.110 +  "paddw %%mm6,%%mm2\n\t" \
1.111 +  "paddw %%mm3,%%mm4\n\t" \
1.112 +  "psubw %%mm1,%%mm2\n\t" \
1.113 +  "#end OC_IDCT_BEGIN\n\t" \
1.114 +
1.115 +/*38+8=46 cycles. One row pass: after OC_IDCT_BEGIN it reloads D' from OC_I(2,_y), forms R0 and R3..R7 in mm0 and mm3..mm7, and stores R1 to OC_I(1,_y); no rounding constant or shift is applied here.*/
1.116 +#define OC_ROW_IDCT(_y,_x) \
1.117 +  "#OC_ROW_IDCT\n" \
1.118 +  OC_IDCT_BEGIN(_y,_x) \
1.119 +  /*r3=D'*/ \
1.120 +  "movq "OC_I(2,_y)",%%mm3\n\t" \
1.121 +  /*r4=E'=E-G*/ \
1.122 +  "psubw %%mm7,%%mm4\n\t" \
1.123 +  /*r1=H'+H'*/ \
1.124 +  "paddw %%mm1,%%mm1\n\t" \
1.125 +  /*r7=G+G*/ \
1.126 +  "paddw %%mm7,%%mm7\n\t" \
1.127 +  /*r1=R1=A''+H'*/ \
1.128 +  "paddw %%mm2,%%mm1\n\t" \
1.129 +  /*r7=G'=E+G*/ \
1.130 +  "paddw %%mm4,%%mm7\n\t" \
1.131 +  /*r4=R4=E'-D'*/ \
1.132 +  "psubw %%mm3,%%mm4\n\t" \
1.133 +  "paddw %%mm3,%%mm3\n\t" \
1.134 +  /*r6=R6=F'-B''*/ \
1.135 +  "psubw %%mm5,%%mm6\n\t" \
1.136 +  "paddw %%mm5,%%mm5\n\t" \
1.137 +  /*r3=R3=E'+D'*/ \
1.138 +  "paddw %%mm4,%%mm3\n\t" \
1.139 +  /*r5=R5=F'+B''*/ \
1.140 +  "paddw %%mm6,%%mm5\n\t" \
1.141 +  /*r7=R7=G'-C'*/ \
1.142 +  "psubw %%mm0,%%mm7\n\t" \
1.143 +  "paddw %%mm0,%%mm0\n\t" \
1.144 +  /*Save R1.*/ \
1.145 +  "movq %%mm1,"OC_I(1,_y)"\n\t" \
1.146 +  /*r0=R0=G'+C'*/ \
1.147 +  "paddw %%mm7,%%mm0\n\t" \
1.148 +  "#end OC_ROW_IDCT\n\t" \
1.149 +
1.150 +/*The following macro does
two 4x4 transposes in place.
1.151 + At entry, we assume:
1.152 + r0 = a3 a2 a1 a0
1.153 + I(1) = b3 b2 b1 b0
1.154 + r2 = c3 c2 c1 c0
1.155 + r3 = d3 d2 d1 d0
1.156 +
1.157 + r4 = e3 e2 e1 e0
1.158 + r5 = f3 f2 f1 f0
1.159 + r6 = g3 g2 g1 g0
1.160 + r7 = h3 h2 h1 h0
1.161 +
1.162 + At exit, we have:
1.163 + I(0) = d0 c0 b0 a0
1.164 + I(1) = d1 c1 b1 a1
1.165 + I(2) = d2 c2 b2 a2
1.166 + I(3) = d3 c3 b3 a3
1.167 +
1.168 + J(4) = h0 g0 f0 e0
1.169 + J(5) = h1 g1 f1 e1
1.170 + J(6) = h2 g2 f2 e2
1.171 + J(7) = h3 g3 f3 e3
1.172 +
1.173 + I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
1.174 + J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
1.175 +
1.176 + Since r1 is free at entry, we calculate the Js first.*/
1.177 +/*19 cycles. OC_I(0,_y) doubles as scratch: r0 is spilled there at the start and reloaded into mm4 for the I half.*/
1.178 +#define OC_TRANSPOSE(_y) \
1.179 +  "#OC_TRANSPOSE\n\t" \
1.180 +  "movq %%mm4,%%mm1\n\t" \
1.181 +  "punpcklwd %%mm5,%%mm4\n\t" \
1.182 +  "movq %%mm0,"OC_I(0,_y)"\n\t" \
1.183 +  "punpckhwd %%mm5,%%mm1\n\t" \
1.184 +  "movq %%mm6,%%mm0\n\t" \
1.185 +  "punpcklwd %%mm7,%%mm6\n\t" \
1.186 +  "movq %%mm4,%%mm5\n\t" \
1.187 +  "punpckldq %%mm6,%%mm4\n\t" \
1.188 +  "punpckhdq %%mm6,%%mm5\n\t" \
1.189 +  "movq %%mm1,%%mm6\n\t" \
1.190 +  "movq %%mm4,"OC_J(4,_y)"\n\t" \
1.191 +  "punpckhwd %%mm7,%%mm0\n\t" \
1.192 +  "movq %%mm5,"OC_J(5,_y)"\n\t" \
1.193 +  "punpckhdq %%mm0,%%mm6\n\t" \
1.194 +  "movq "OC_I(0,_y)",%%mm4\n\t" \
1.195 +  "punpckldq %%mm0,%%mm1\n\t" \
1.196 +  "movq "OC_I(1,_y)",%%mm5\n\t" \
1.197 +  "movq %%mm4,%%mm0\n\t" \
1.198 +  "movq %%mm6,"OC_J(7,_y)"\n\t" \
1.199 +  "punpcklwd %%mm5,%%mm0\n\t" \
1.200 +  "movq %%mm1,"OC_J(6,_y)"\n\t" \
1.201 +  "punpckhwd %%mm5,%%mm4\n\t" \
1.202 +  "movq %%mm2,%%mm5\n\t" \
1.203 +  "punpcklwd %%mm3,%%mm2\n\t" \
1.204 +  "movq %%mm0,%%mm1\n\t" \
1.205 +  "punpckldq %%mm2,%%mm0\n\t" \
1.206 +  "punpckhdq %%mm2,%%mm1\n\t" \
1.207 +  "movq %%mm4,%%mm2\n\t" \
1.208 +  "movq %%mm0,"OC_I(0,_y)"\n\t" \
1.209 +  "punpckhwd %%mm3,%%mm5\n\t" \
1.210 +  "movq %%mm1,"OC_I(1,_y)"\n\t" \
1.211 +  "punpckhdq %%mm5,%%mm4\n\t" \
1.212 +  "punpckldq %%mm5,%%mm2\n\t" \
1.213 +  "movq %%mm4,"OC_I(3,_y)"\n\t" \
1.214 +  "movq %%mm2,"OC_I(2,_y)"\n\t" \
1.215 +  "#end OC_TRANSPOSE\n\t" \
1.216 +
1.217 +/*38+19=57 cycles. One column pass, in place on _y: same butterflies as OC_ROW_IDCT, but the constant row at byte offset 0x00 of c is folded into every result before it is shifted right by 4 (the NRk values), and all eight results are stored through OC_I()/OC_J().*/
1.218 +#define OC_COLUMN_IDCT(_y) \
1.219 +  "#OC_COLUMN_IDCT\n" \
1.220 +  OC_IDCT_BEGIN(_y,_y) \
1.221 +  "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
1.222 +  /*r1=H'+H'*/ \
1.223 +  "paddw %%mm1,%%mm1\n\t" \
1.224 +  /*r1=R1=A''+H'*/ \
1.225 +  "paddw %%mm2,%%mm1\n\t" \
1.226 +  /*r2=NR2*/ \
1.227 +  "psraw $4,%%mm2\n\t" \
1.228 +  /*r4=E'=E-G*/ \
1.229 +  "psubw %%mm7,%%mm4\n\t" \
1.230 +  /*r1=NR1*/ \
1.231 +  "psraw $4,%%mm1\n\t" \
1.232 +  /*r3=D'*/ \
1.233 +  "movq "OC_I(2,_y)",%%mm3\n\t" \
1.234 +  /*r7=G+G*/ \
1.235 +  "paddw %%mm7,%%mm7\n\t" \
1.236 +  /*Store NR2 at I(2).*/ \
1.237 +  "movq %%mm2,"OC_I(2,_y)"\n\t" \
1.238 +  /*r7=G'=E+G*/ \
1.239 +  "paddw %%mm4,%%mm7\n\t" \
1.240 +  /*Store NR1 at I(1).*/ \
1.241 +  "movq %%mm1,"OC_I(1,_y)"\n\t" \
1.242 +  /*r4=R4=E'-D'*/ \
1.243 +  "psubw %%mm3,%%mm4\n\t" \
1.244 +  "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
1.245 +  /*r3=D'+D'*/ \
1.246 +  "paddw %%mm3,%%mm3\n\t" \
1.247 +  /*r3=R3=E'+D'*/ \
1.248 +  "paddw %%mm4,%%mm3\n\t" \
1.249 +  /*r4=NR4*/ \
1.250 +  "psraw $4,%%mm4\n\t" \
1.251 +  /*r6=R6=F'-B''*/ \
1.252 +  "psubw %%mm5,%%mm6\n\t" \
1.253 +  /*r3=NR3*/ \
1.254 +  "psraw $4,%%mm3\n\t" \
1.255 +  "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
1.256 +  /*r5=B''+B''*/ \
1.257 +  "paddw %%mm5,%%mm5\n\t" \
1.258 +  /*r5=R5=F'+B''*/ \
1.259 +  "paddw %%mm6,%%mm5\n\t" \
1.260 +  /*r6=NR6*/ \
1.261 +  "psraw $4,%%mm6\n\t" \
1.262 +  /*Store NR4 at J(4).*/ \
1.263 +  "movq %%mm4,"OC_J(4,_y)"\n\t" \
1.264 +  /*r5=NR5*/ \
1.265 +  "psraw $4,%%mm5\n\t" \
1.266 +  /*Store NR3 at I(3).*/ \
1.267 +  "movq %%mm3,"OC_I(3,_y)"\n\t" \
1.268 +  /*r7=R7=G'-C'*/ \
1.269 +  "psubw %%mm0,%%mm7\n\t" \
1.270 +  "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
1.271 +  /*r0=C'+C'*/ \
1.272 +  "paddw %%mm0,%%mm0\n\t" \
1.273 +  /*r0=R0=G'+C'*/ \
1.274 +  "paddw %%mm7,%%mm0\n\t" \
1.275 +  /*r7=NR7*/ \
1.276 +  "psraw $4,%%mm7\n\t" \
1.277 +  /*Store NR6 at J(6).*/ \
1.278 +  "movq %%mm6,"OC_J(6,_y)"\n\t" \
1.279 +  /*r0=NR0*/ \
1.280 +  "psraw $4,%%mm0\n\t" \
1.281 +  /*Store NR5 at J(5).*/ \
1.282 +  "movq %%mm5,"OC_J(5,_y)"\n\t" \
1.283 +  /*Store NR7 at J(7).*/ \
1.284 +  "movq %%mm7,"OC_J(7,_y)"\n\t" \
1.285 +  /*Store NR0 at I(0).*/ \
1.286 +  "movq %%mm0,"OC_I(0,_y)"\n\t" \
1.287 +  "#end OC_COLUMN_IDCT\n\t" \
1.288 +
1.289 +static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
1.290 +  /*This routine accepts an 8x8 matrix, but in partially transposed form.
1.291 +    Every 4x4 block is transposed. Two row passes (input bytes 0..63, then 64..127 through the +64/+72 OC_I/OC_J offsets), each followed by OC_TRANSPOSE, write _y; two column passes (byte offset +0, then +8, with a 16-byte stride) then run in place on _y.*/
1.292 +  __asm__ __volatile__(
1.293 +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
1.294 +#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
1.295 +    OC_ROW_IDCT(y,x)
1.296 +    OC_TRANSPOSE(y)
1.297 +#undef OC_I
1.298 +#undef OC_J
1.299 +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+64,_y)
1.300 +#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+72,_y)
1.301 +    OC_ROW_IDCT(y,x)
1.302 +    OC_TRANSPOSE(y)
1.303 +#undef OC_I
1.304 +#undef OC_J
1.305 +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
1.306 +#define OC_J(_k,_y) OC_I(_k,_y)
1.307 +    OC_COLUMN_IDCT(y)
1.308 +#undef OC_I
1.309 +#undef OC_J
1.310 +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
1.311 +#define OC_J(_k,_y) OC_I(_k,_y)
1.312 +    OC_COLUMN_IDCT(y)
1.313 +#undef OC_I
1.314 +#undef OC_J
1.315 +    :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
1.316 +    :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
1.317 +     [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
1.318 +  );
1.319 +  if(_x!=_y){ /*Unless the transform ran in place, clear all 64 input coefficients: 4 iterations of four 8-byte stores of the zeroed mm0.*/
1.320 +    int i;
1.321 +    __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
1.322 +    for(i=0;i<4;i++){
1.323 +      __asm__ __volatile__(
1.324 +        "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
1.325 +        "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
1.326 +        "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
1.327 +        "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
1.328 +        :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
1.329 +      );
1.330 +    }
1.331 +  }
1.332 +}
1.333 +
1.334 +/*25 cycles. Reduced counterpart of OC_IDCT_BEGIN: it loads only OC_I(0..3,_x) and never reads an OC_J() input; it still parks intermediates in OC_I(1,_y) and OC_I(2,_y) and reloads OC_I(1,_y) into mm0 near the end.*/
1.335 +#define OC_IDCT_BEGIN_10(_y,_x) \
1.336 +  "#OC_IDCT_BEGIN_10\n\t" \
1.337 +  "movq "OC_I(3,_x)",%%mm2\n\t" \
1.338 +  "nop\n\t" \
1.339 +  "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
1.340 +  "movq %%mm2,%%mm4\n\t" \
1.341 +  "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
1.342 +  "pmulhw %%mm6,%%mm4\n\t" \
1.343 +  "movq "OC_I(1,_x)",%%mm3\n\t" \
1.344 +  "pmulhw %%mm2,%%mm1\n\t" \
1.345 +  "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
1.346 +  "paddw %%mm2,%%mm4\n\t" \
1.347 +  "pxor %%mm6,%%mm6\n\t" \
1.348 +  "paddw %%mm1,%%mm2\n\t" \
1.349 +  "movq "OC_I(2,_x)",%%mm5\n\t" \
1.350 +  "pmulhw %%mm3,%%mm0\n\t" \
1.351 +  "movq %%mm5,%%mm1\n\t" \
1.352 +  "paddw %%mm3,%%mm0\n\t" \
1.353 +  "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
1.354 +  "psubw %%mm2,%%mm6\n\t" \
1.355 +  "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
1.356 +  "psubw %%mm4,%%mm0\n\t" \
1.357 +  "movq "OC_I(2,_x)",%%mm7\n\t" \
1.358 +  "paddw %%mm4,%%mm4\n\t" \
1.359 +  "paddw %%mm5,%%mm7\n\t" \
1.360 +  "paddw %%mm0,%%mm4\n\t" \
1.361 +  "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
1.362 +  "psubw %%mm6,%%mm3\n\t" \
1.363 +  "movq %%mm4,"OC_I(1,_y)"\n\t" \
1.364 +  "paddw %%mm6,%%mm6\n\t" \
1.365 +  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
1.366 +  "paddw %%mm3,%%mm6\n\t" \
1.367 +  "movq %%mm3,%%mm5\n\t" \
1.368 +  "pmulhw %%mm4,%%mm3\n\t" \
1.369 +  "movq %%mm6,"OC_I(2,_y)"\n\t" \
1.370 +  "movq %%mm0,%%mm2\n\t" \
1.371 +  "movq "OC_I(0,_x)",%%mm6\n\t" \
1.372 +  "pmulhw %%mm4,%%mm0\n\t" \
1.373 +  "paddw %%mm3,%%mm5\n\t" \
1.374 +  "paddw %%mm0,%%mm2\n\t" \
1.375 +  "psubw %%mm1,%%mm5\n\t" \
1.376 +  "pmulhw %%mm4,%%mm6\n\t" \
1.377 +  "paddw "OC_I(0,_x)",%%mm6\n\t" \
1.378 +  "paddw %%mm1,%%mm1\n\t" \
1.379 +  "movq %%mm6,%%mm4\n\t" \
1.380 +  "paddw %%mm5,%%mm1\n\t" \
1.381 +  "psubw %%mm2,%%mm6\n\t" \
1.382 +  "paddw %%mm2,%%mm2\n\t" \
1.383 +  "movq "OC_I(1,_y)",%%mm0\n\t" \
1.384 +  "paddw %%mm6,%%mm2\n\t" \
1.385 +  "psubw %%mm1,%%mm2\n\t" \
1.386 +  "nop\n\t" \
1.387 +  "#end OC_IDCT_BEGIN_10\n\t" \
1.388 +
1.389 +/*25+8=33
cycles.*/
1.390 +#define OC_ROW_IDCT_10(_y,_x) \
1.391 +  "#OC_ROW_IDCT_10\n\t" \
1.392 +  OC_IDCT_BEGIN_10(_y,_x) \
1.393 +  /*r3=D'*/ \
1.394 +  "movq "OC_I(2,_y)",%%mm3\n\t" \
1.395 +  /*r4=E'=E-G*/ \
1.396 +  "psubw %%mm7,%%mm4\n\t" \
1.397 +  /*r1=H'+H'*/ \
1.398 +  "paddw %%mm1,%%mm1\n\t" \
1.399 +  /*r7=G+G*/ \
1.400 +  "paddw %%mm7,%%mm7\n\t" \
1.401 +  /*r1=R1=A''+H'*/ \
1.402 +  "paddw %%mm2,%%mm1\n\t" \
1.403 +  /*r7=G'=E+G*/ \
1.404 +  "paddw %%mm4,%%mm7\n\t" \
1.405 +  /*r4=R4=E'-D'*/ \
1.406 +  "psubw %%mm3,%%mm4\n\t" \
1.407 +  "paddw %%mm3,%%mm3\n\t" \
1.408 +  /*r6=R6=F'-B''*/ \
1.409 +  "psubw %%mm5,%%mm6\n\t" \
1.410 +  "paddw %%mm5,%%mm5\n\t" \
1.411 +  /*r3=R3=E'+D'*/ \
1.412 +  "paddw %%mm4,%%mm3\n\t" \
1.413 +  /*r5=R5=F'+B''*/ \
1.414 +  "paddw %%mm6,%%mm5\n\t" \
1.415 +  /*r7=R7=G'-C'*/ \
1.416 +  "psubw %%mm0,%%mm7\n\t" \
1.417 +  "paddw %%mm0,%%mm0\n\t" \
1.418 +  /*Save R1.*/ \
1.419 +  "movq %%mm1,"OC_I(1,_y)"\n\t" \
1.420 +  /*r0=R0=G'+C'*/ \
1.421 +  "paddw %%mm7,%%mm0\n\t" \
1.422 +  "#end OC_ROW_IDCT_10\n\t" \
1.423 +
1.424 +/*25+19=44 cycles. Same instruction tail as OC_COLUMN_IDCT, applied after OC_IDCT_BEGIN_10.*/
1.425 +#define OC_COLUMN_IDCT_10(_y) \
1.426 +  "#OC_COLUMN_IDCT_10\n\t" \
1.427 +  OC_IDCT_BEGIN_10(_y,_y) \
1.428 +  "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
1.429 +  /*r1=H'+H'*/ \
1.430 +  "paddw %%mm1,%%mm1\n\t" \
1.431 +  /*r1=R1=A''+H'*/ \
1.432 +  "paddw %%mm2,%%mm1\n\t" \
1.433 +  /*r2=NR2*/ \
1.434 +  "psraw $4,%%mm2\n\t" \
1.435 +  /*r4=E'=E-G*/ \
1.436 +  "psubw %%mm7,%%mm4\n\t" \
1.437 +  /*r1=NR1*/ \
1.438 +  "psraw $4,%%mm1\n\t" \
1.439 +  /*r3=D'*/ \
1.440 +  "movq "OC_I(2,_y)",%%mm3\n\t" \
1.441 +  /*r7=G+G*/ \
1.442 +  "paddw %%mm7,%%mm7\n\t" \
1.443 +  /*Store NR2 at I(2).*/ \
1.444 +  "movq %%mm2,"OC_I(2,_y)"\n\t" \
1.445 +  /*r7=G'=E+G*/ \
1.446 +  "paddw %%mm4,%%mm7\n\t" \
1.447 +  /*Store NR1 at I(1).*/ \
1.448 +  "movq %%mm1,"OC_I(1,_y)"\n\t" \
1.449 +  /*r4=R4=E'-D'*/ \
1.450 +  "psubw %%mm3,%%mm4\n\t" \
1.451 +  "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
1.452 +  /*r3=D'+D'*/ \
1.453 +  "paddw %%mm3,%%mm3\n\t" \
1.454 +  /*r3=R3=E'+D'*/ \
1.455 +  "paddw %%mm4,%%mm3\n\t" \
1.456 +  /*r4=NR4*/ \
1.457 +  "psraw $4,%%mm4\n\t" \
1.458 +  /*r6=R6=F'-B''*/ \
1.459 +  "psubw %%mm5,%%mm6\n\t" \
1.460 +  /*r3=NR3*/ \
1.461 +  "psraw $4,%%mm3\n\t" \
1.462 +  "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
1.463 +  /*r5=B''+B''*/ \
1.464 +  "paddw %%mm5,%%mm5\n\t" \
1.465 +  /*r5=R5=F'+B''*/ \
1.466 +  "paddw %%mm6,%%mm5\n\t" \
1.467 +  /*r6=NR6*/ \
1.468 +  "psraw $4,%%mm6\n\t" \
1.469 +  /*Store NR4 at J(4).*/ \
1.470 +  "movq %%mm4,"OC_J(4,_y)"\n\t" \
1.471 +  /*r5=NR5*/ \
1.472 +  "psraw $4,%%mm5\n\t" \
1.473 +  /*Store NR3 at I(3).*/ \
1.474 +  "movq %%mm3,"OC_I(3,_y)"\n\t" \
1.475 +  /*r7=R7=G'-C'*/ \
1.476 +  "psubw %%mm0,%%mm7\n\t" \
1.477 +  "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
1.478 +  /*r0=C'+C'*/ \
1.479 +  "paddw %%mm0,%%mm0\n\t" \
1.480 +  /*r0=R0=G'+C'*/ \
1.481 +  "paddw %%mm7,%%mm0\n\t" \
1.482 +  /*r7=NR7*/ \
1.483 +  "psraw $4,%%mm7\n\t" \
1.484 +  /*Store NR6 at J(6).*/ \
1.485 +  "movq %%mm6,"OC_J(6,_y)"\n\t" \
1.486 +  /*r0=NR0*/ \
1.487 +  "psraw $4,%%mm0\n\t" \
1.488 +  /*Store NR5 at J(5).*/ \
1.489 +  "movq %%mm5,"OC_J(5,_y)"\n\t" \
1.490 +  /*Store NR7 at J(7).*/ \
1.491 +  "movq %%mm7,"OC_J(7,_y)"\n\t" \
1.492 +  /*Store NR0 at I(0).*/ \
1.493 +  "movq %%mm0,"OC_I(0,_y)"\n\t" \
1.494 +  "#end OC_COLUMN_IDCT_10\n\t" \
1.495 +
1.496 +static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
1.497 +  __asm__ __volatile__(
1.498 +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
1.499 +#define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
1.500 +    /*Done with dequant, descramble, and partial transpose.
1.501 +      Now do the iDCT itself: a single row pass plus OC_TRANSPOSE (only OC_I(0..3,_x) is read), then the two column passes in place on _y.*/
1.502 +    OC_ROW_IDCT_10(y,x)
1.503 +    OC_TRANSPOSE(y)
1.504 +#undef OC_I
1.505 +#undef OC_J
1.506 +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
1.507 +#define OC_J(_k,_y) OC_I(_k,_y)
1.508 +    OC_COLUMN_IDCT_10(y)
1.509 +#undef OC_I
1.510 +#undef OC_J
1.511 +#define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
1.512 +#define OC_J(_k,_y) OC_I(_k,_y)
1.513 +    OC_COLUMN_IDCT_10(y)
1.514 +#undef OC_I
1.515 +#undef OC_J
1.516 +    :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
1.517 +    :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
1.518 +     [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
1.519 +  );
1.520 +  if(_x!=_y){ /*Unless the transform ran in place, clear the input words the row pass read: 8 bytes at each of the byte offsets 0x00, 0x10, 0x20 and 0x30 of _x.*/
1.521 +    __asm__ __volatile__(
1.522 +      "pxor %%mm0,%%mm0\n\t"
1.523 +      "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
1.524 +      "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
1.525 +      "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
1.526 +      "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
1.527 +      :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
1.528 +    );
1.529 +  }
1.530 +}
1.531 +
1.532 +/*Performs an inverse 8x8 Type-II DCT transform.
1.533 +  The input is assumed to be scaled by a factor of 4 relative to the orthonormal
1.534 +   version of the transform.*/
1.535 +void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
1.536 +  /*_last_zzi is subtly different from an actual count of the number of
1.537 +     coefficients we decoded for this block.
1.538 +    It contains the value of zzi BEFORE the final token in the block was
1.539 +     decoded.
1.540 +    In most cases this is an EOB token (the continuation of an EOB run from a
1.541 +     previous block counts), and so this is the same as the coefficient count.
1.542 +    However, in the case that the last token was NOT an EOB token, but filled
1.543 +     the block up with exactly 64 coefficients, _last_zzi will be less than 64.
1.544 +    Provided the last token was not a pure zero run, the minimum value it can
1.545 +     be is 46, and so that doesn't affect any of the cases in this routine.
1.546 +    However, if the last token WAS a pure zero run of length 63, then _last_zzi
1.547 +     will be 1 while the number of coefficients decoded is 64.
1.548 +    Thus, we will trigger the following special case, where the real
1.549 +     coefficient count would not.
1.550 +    Note also that a zero run of length 64 will give _last_zzi a value of 0,
1.551 +     but we still process the DC coefficient, which might have a non-zero value
1.552 +     due to DC prediction.
1.553 +    Although convoluted, this is arguably the correct behavior: it allows us to
1.554 +     use a smaller transform when the block ends with a long zero run instead
1.555 +     of a normal EOB token.
1.556 +    It could be smarter... multiple separate zero runs at the end of a block
1.557 +     will fool it, but an encoder that generates these really deserves what it
1.558 +     gets.
1.559 +    Needless to say we inherited this approach from VP3.*/
1.560 +  /*Then perform the iDCT: oc_idct8x8_10_mmx when _last_zzi<=10, the full oc_idct8x8_slow_mmx otherwise.*/
1.561 +  if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x);
1.562 +  else oc_idct8x8_slow_mmx(_y,_x);
1.563 +}
1.564 +
1.565 +#endif