media/libtheora/lib/x86/mmxidct.c

Wed, 31 Dec 2014 06:09:35 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Wed, 31 Dec 2014 06:09:35 +0100
changeset 0
6474c204b198
permissions
-rw-r--r--

Cloned upstream origin tor-browser at tor-browser-31.3.0esr-4.5-1-build1
revision ID fc1c9ff7c1b2defdbc039f12214767608f46423f for hacking purpose.

michael@0 1 /********************************************************************
michael@0 2 * *
michael@0 3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
michael@0 4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
michael@0 5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
michael@0 6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
michael@0 7 * *
michael@0 8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
michael@0 9 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
michael@0 10 * *
michael@0 11 ********************************************************************
michael@0 12
michael@0 13 function:
michael@0 14 last mod: $Id: mmxidct.c 17446 2010-09-23 20:06:20Z tterribe $
michael@0 15
michael@0 16 ********************************************************************/
michael@0 17
michael@0 18 /*MMX acceleration of Theora's iDCT.
michael@0 19 Originally written by Rudolf Marek, based on code from On2's VP3.*/
michael@0 20 #include "x86int.h"
michael@0 21 #include "../dct.h"
michael@0 22
michael@0 23 #if defined(OC_X86_ASM)
michael@0 24
michael@0 25 /*These are offsets into the table of constants below.*/
/*NOTE(review): the inline assembly in this file does not reference these two
   macros; it addresses the constant operand with literal byte offsets instead
   (0x00 for the value added before each "psraw $4", 0x10...0x70 for the
   pmulhw multipliers).
  That usage looks inconsistent with the layout described here (cosines
   first, 8's last), and the table itself is not defined in the visible part
   of this file, so confirm these values against the actual table before
   relying on them.*/
michael@0 26 /*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
michael@0 27 #define OC_COSINE_OFFSET (0)
michael@0 28 /*A row of 8's.*/
michael@0 29 #define OC_EIGHT_OFFSET (56)
michael@0 30
michael@0 31
michael@0 32
/*Common first stage of the 8-point iDCT, shared by OC_ROW_IDCT and
   OC_COLUMN_IDCT.
  It expands to a sequence of assembler string literals and must be used
   inside an __asm__ statement that provides a memory operand named c (the
   constant table) and while OC_I and OC_J are defined.
  Inputs are read from OC_I(0...3,_x) and OC_J(4...7,_x); multipliers are read
   from c at byte offsets 0x10...0x70.
  Two intermediate values are spilled to OC_I(1,_y) and OC_I(2,_y); the one in
   OC_I(1,_y) is reloaded into mm0 at the end, while the one in OC_I(2,_y) is
   left for the macro that follows to load.
  All other intermediate results are left in mm1, mm2 and mm4...mm7, so the
   instruction order here and in the following macro must not be changed
   independently.*/
michael@0 33 /*38 cycles*/
michael@0 34 #define OC_IDCT_BEGIN(_y,_x) \
michael@0 35 "#OC_IDCT_BEGIN\n\t" \
michael@0 36 "movq "OC_I(3,_x)",%%mm2\n\t" \
michael@0 37 "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
michael@0 38 "movq %%mm2,%%mm4\n\t" \
michael@0 39 "movq "OC_J(5,_x)",%%mm7\n\t" \
michael@0 40 "pmulhw %%mm6,%%mm4\n\t" \
michael@0 41 "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
michael@0 42 "pmulhw %%mm7,%%mm6\n\t" \
michael@0 43 "movq %%mm1,%%mm5\n\t" \
michael@0 44 "pmulhw %%mm2,%%mm1\n\t" \
michael@0 45 "movq "OC_I(1,_x)",%%mm3\n\t" \
michael@0 46 "pmulhw %%mm7,%%mm5\n\t" \
michael@0 47 "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
michael@0 48 "paddw %%mm2,%%mm4\n\t" \
michael@0 49 "paddw %%mm7,%%mm6\n\t" \
michael@0 50 "paddw %%mm1,%%mm2\n\t" \
michael@0 51 "movq "OC_J(7,_x)",%%mm1\n\t" \
michael@0 52 "paddw %%mm5,%%mm7\n\t" \
michael@0 53 "movq %%mm0,%%mm5\n\t" \
michael@0 54 "pmulhw %%mm3,%%mm0\n\t" \
michael@0 55 "paddw %%mm7,%%mm4\n\t" \
michael@0 56 "pmulhw %%mm1,%%mm5\n\t" \
michael@0 57 "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \
michael@0 58 "psubw %%mm2,%%mm6\n\t" \
michael@0 59 "paddw %%mm3,%%mm0\n\t" \
michael@0 60 "pmulhw %%mm7,%%mm3\n\t" \
michael@0 61 "movq "OC_I(2,_x)",%%mm2\n\t" \
michael@0 62 "pmulhw %%mm1,%%mm7\n\t" \
michael@0 63 "paddw %%mm1,%%mm5\n\t" \
michael@0 64 "movq %%mm2,%%mm1\n\t" \
michael@0 65 "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \
michael@0 66 "psubw %%mm5,%%mm3\n\t" \
michael@0 67 "movq "OC_J(6,_x)",%%mm5\n\t" \
michael@0 68 "paddw %%mm7,%%mm0\n\t" \
michael@0 69 "movq %%mm5,%%mm7\n\t" \
michael@0 70 "psubw %%mm4,%%mm0\n\t" \
michael@0 71 "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
michael@0 72 "paddw %%mm1,%%mm2\n\t" \
michael@0 73 "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
michael@0 74 "paddw %%mm4,%%mm4\n\t" \
michael@0 75 "paddw %%mm0,%%mm4\n\t" \
michael@0 76 "psubw %%mm6,%%mm3\n\t" \
michael@0 77 "paddw %%mm7,%%mm5\n\t" \
michael@0 78 "paddw %%mm6,%%mm6\n\t" \
michael@0 79 "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
michael@0 80 "paddw %%mm3,%%mm6\n\t" \
michael@0 81 "movq %%mm4,"OC_I(1,_y)"\n\t" \
michael@0 82 "psubw %%mm5,%%mm1\n\t" \
michael@0 83 "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
michael@0 84 "movq %%mm3,%%mm5\n\t" \
michael@0 85 "pmulhw %%mm4,%%mm3\n\t" \
michael@0 86 "paddw %%mm2,%%mm7\n\t" \
michael@0 87 "movq %%mm6,"OC_I(2,_y)"\n\t" \
michael@0 88 "movq %%mm0,%%mm2\n\t" \
michael@0 89 "movq "OC_I(0,_x)",%%mm6\n\t" \
michael@0 90 "pmulhw %%mm4,%%mm0\n\t" \
michael@0 91 "paddw %%mm3,%%mm5\n\t" \
michael@0 92 "movq "OC_J(4,_x)",%%mm3\n\t" \
michael@0 93 "psubw %%mm1,%%mm5\n\t" \
michael@0 94 "paddw %%mm0,%%mm2\n\t" \
michael@0 95 "psubw %%mm3,%%mm6\n\t" \
michael@0 96 "movq %%mm6,%%mm0\n\t" \
michael@0 97 "pmulhw %%mm4,%%mm6\n\t" \
michael@0 98 "paddw %%mm3,%%mm3\n\t" \
michael@0 99 "paddw %%mm1,%%mm1\n\t" \
michael@0 100 "paddw %%mm0,%%mm3\n\t" \
michael@0 101 "paddw %%mm5,%%mm1\n\t" \
michael@0 102 "pmulhw %%mm3,%%mm4\n\t" \
michael@0 103 "paddw %%mm0,%%mm6\n\t" \
michael@0 104 "psubw %%mm2,%%mm6\n\t" \
michael@0 105 "paddw %%mm2,%%mm2\n\t" \
michael@0 106 "movq "OC_I(1,_y)",%%mm0\n\t" \
michael@0 107 "paddw %%mm6,%%mm2\n\t" \
michael@0 108 "paddw %%mm3,%%mm4\n\t" \
michael@0 109 "psubw %%mm1,%%mm2\n\t" \
michael@0 110 "#end OC_IDCT_BEGIN\n\t" \
michael@0 111
/*Row pass of the full iDCT: expands OC_IDCT_BEGIN and then performs the final
   butterflies.
  On exit R1 has been stored to OC_I(1,_y) and the remaining results are left
   in mm0 and mm2...mm7, which is the register/memory state that OC_TRANSPOSE
   documents as its entry condition.
  No rounding or shifting is applied here.*/
michael@0 112 /*38+8=46 cycles.*/
michael@0 113 #define OC_ROW_IDCT(_y,_x) \
michael@0 114 "#OC_ROW_IDCT\n" \
michael@0 115 OC_IDCT_BEGIN(_y,_x) \
michael@0 116 /*r3=D'*/ \
michael@0 117 "movq "OC_I(2,_y)",%%mm3\n\t" \
michael@0 118 /*r4=E'=E-G*/ \
michael@0 119 "psubw %%mm7,%%mm4\n\t" \
michael@0 120 /*r1=H'+H'*/ \
michael@0 121 "paddw %%mm1,%%mm1\n\t" \
michael@0 122 /*r7=G+G*/ \
michael@0 123 "paddw %%mm7,%%mm7\n\t" \
michael@0 124 /*r1=R1=A''+H'*/ \
michael@0 125 "paddw %%mm2,%%mm1\n\t" \
michael@0 126 /*r7=G'=E+G*/ \
michael@0 127 "paddw %%mm4,%%mm7\n\t" \
michael@0 128 /*r4=R4=E'-D'*/ \
michael@0 129 "psubw %%mm3,%%mm4\n\t" \
michael@0 130 "paddw %%mm3,%%mm3\n\t" \
michael@0 131 /*r6=R6=F'-B''*/ \
michael@0 132 "psubw %%mm5,%%mm6\n\t" \
michael@0 133 "paddw %%mm5,%%mm5\n\t" \
michael@0 134 /*r3=R3=E'+D'*/ \
michael@0 135 "paddw %%mm4,%%mm3\n\t" \
michael@0 136 /*r5=R5=F'+B''*/ \
michael@0 137 "paddw %%mm6,%%mm5\n\t" \
michael@0 138 /*r7=R7=G'-C'*/ \
michael@0 139 "psubw %%mm0,%%mm7\n\t" \
michael@0 140 "paddw %%mm0,%%mm0\n\t" \
michael@0 141 /*Save R1.*/ \
michael@0 142 "movq %%mm1,"OC_I(1,_y)"\n\t" \
michael@0 143 /*r0=R0=G'+C'*/ \
michael@0 144 "paddw %%mm7,%%mm0\n\t" \
michael@0 145 "#end OC_ROW_IDCT\n\t" \
michael@0 146
michael@0 147 /*The following macro does two 4x4 transposes in place.
michael@0 148 At entry, we assume:
michael@0 149 r0 = a3 a2 a1 a0
michael@0 150 I(1) = b3 b2 b1 b0
michael@0 151 r2 = c3 c2 c1 c0
michael@0 152 r3 = d3 d2 d1 d0
michael@0 153
michael@0 154 r4 = e3 e2 e1 e0
michael@0 155 r5 = f3 f2 f1 f0
michael@0 156 r6 = g3 g2 g1 g0
michael@0 157 r7 = h3 h2 h1 h0
michael@0 158
michael@0 159 At exit, we have:
michael@0 160 I(0) = d0 c0 b0 a0
michael@0 161 I(1) = d1 c1 b1 a1
michael@0 162 I(2) = d2 c2 b2 a2
michael@0 163 I(3) = d3 c3 b3 a3
michael@0 164
michael@0 165 J(4) = h0 g0 f0 e0
michael@0 166 J(5) = h1 g1 f1 e1
michael@0 167 J(6) = h2 g2 f2 e2
michael@0 168 J(7) = h3 g3 f3 e3
michael@0 169
michael@0 170 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
michael@0 171 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
michael@0 172
michael@0 173 Since r1 is free at entry, we calculate the Js first.*/
/*r0 is spilled to I(0) by the third instruction so that mm0 can serve as a
   second scratch register while the Js are computed; it is reloaded from I(0)
   (into mm4) before the Is are transposed.
  All loads and stores go through OC_I/OC_J applied to _y only.*/
michael@0 174 /*19 cycles.*/
michael@0 175 #define OC_TRANSPOSE(_y) \
michael@0 176 "#OC_TRANSPOSE\n\t" \
michael@0 177 "movq %%mm4,%%mm1\n\t" \
michael@0 178 "punpcklwd %%mm5,%%mm4\n\t" \
michael@0 179 "movq %%mm0,"OC_I(0,_y)"\n\t" \
michael@0 180 "punpckhwd %%mm5,%%mm1\n\t" \
michael@0 181 "movq %%mm6,%%mm0\n\t" \
michael@0 182 "punpcklwd %%mm7,%%mm6\n\t" \
michael@0 183 "movq %%mm4,%%mm5\n\t" \
michael@0 184 "punpckldq %%mm6,%%mm4\n\t" \
michael@0 185 "punpckhdq %%mm6,%%mm5\n\t" \
michael@0 186 "movq %%mm1,%%mm6\n\t" \
michael@0 187 "movq %%mm4,"OC_J(4,_y)"\n\t" \
michael@0 188 "punpckhwd %%mm7,%%mm0\n\t" \
michael@0 189 "movq %%mm5,"OC_J(5,_y)"\n\t" \
michael@0 190 "punpckhdq %%mm0,%%mm6\n\t" \
michael@0 191 "movq "OC_I(0,_y)",%%mm4\n\t" \
michael@0 192 "punpckldq %%mm0,%%mm1\n\t" \
michael@0 193 "movq "OC_I(1,_y)",%%mm5\n\t" \
michael@0 194 "movq %%mm4,%%mm0\n\t" \
michael@0 195 "movq %%mm6,"OC_J(7,_y)"\n\t" \
michael@0 196 "punpcklwd %%mm5,%%mm0\n\t" \
michael@0 197 "movq %%mm1,"OC_J(6,_y)"\n\t" \
michael@0 198 "punpckhwd %%mm5,%%mm4\n\t" \
michael@0 199 "movq %%mm2,%%mm5\n\t" \
michael@0 200 "punpcklwd %%mm3,%%mm2\n\t" \
michael@0 201 "movq %%mm0,%%mm1\n\t" \
michael@0 202 "punpckldq %%mm2,%%mm0\n\t" \
michael@0 203 "punpckhdq %%mm2,%%mm1\n\t" \
michael@0 204 "movq %%mm4,%%mm2\n\t" \
michael@0 205 "movq %%mm0,"OC_I(0,_y)"\n\t" \
michael@0 206 "punpckhwd %%mm3,%%mm5\n\t" \
michael@0 207 "movq %%mm1,"OC_I(1,_y)"\n\t" \
michael@0 208 "punpckhdq %%mm5,%%mm4\n\t" \
michael@0 209 "punpckldq %%mm5,%%mm2\n\t" \
michael@0 210 "movq %%mm4,"OC_I(3,_y)"\n\t" \
michael@0 211 "movq %%mm2,"OC_I(2,_y)"\n\t" \
michael@0 212 "#end OC_TRANSPOSE\n\t" \
michael@0 213
/*Column pass of the full iDCT, operating in place on _y (OC_IDCT_BEGIN is
   expanded with _y as both source and destination).
  Unlike the row pass, every result is normalized before it is stored: the
   constant at byte offset 0x00 of c is added and the sum is arithmetically
   shifted right by 4 ("psraw $4").
  The constant is added to only one register of each output pair; its partner
   picks it up through the add that follows (e.g. R1 is formed from the
   already-biased r2).
  NR0...NR3 are stored through OC_I and NR4...NR7 through OC_J.*/
michael@0 214 /*38+19=57 cycles.*/
michael@0 215 #define OC_COLUMN_IDCT(_y) \
michael@0 216 "#OC_COLUMN_IDCT\n" \
michael@0 217 OC_IDCT_BEGIN(_y,_y) \
michael@0 218 "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
michael@0 219 /*r1=H'+H'*/ \
michael@0 220 "paddw %%mm1,%%mm1\n\t" \
michael@0 221 /*r1=R1=A''+H'*/ \
michael@0 222 "paddw %%mm2,%%mm1\n\t" \
michael@0 223 /*r2=NR2*/ \
michael@0 224 "psraw $4,%%mm2\n\t" \
michael@0 225 /*r4=E'=E-G*/ \
michael@0 226 "psubw %%mm7,%%mm4\n\t" \
michael@0 227 /*r1=NR1*/ \
michael@0 228 "psraw $4,%%mm1\n\t" \
michael@0 229 /*r3=D'*/ \
michael@0 230 "movq "OC_I(2,_y)",%%mm3\n\t" \
michael@0 231 /*r7=G+G*/ \
michael@0 232 "paddw %%mm7,%%mm7\n\t" \
michael@0 233 /*Store NR2 at I(2).*/ \
michael@0 234 "movq %%mm2,"OC_I(2,_y)"\n\t" \
michael@0 235 /*r7=G'=E+G*/ \
michael@0 236 "paddw %%mm4,%%mm7\n\t" \
michael@0 237 /*Store NR1 at I(1).*/ \
michael@0 238 "movq %%mm1,"OC_I(1,_y)"\n\t" \
michael@0 239 /*r4=R4=E'-D'*/ \
michael@0 240 "psubw %%mm3,%%mm4\n\t" \
michael@0 241 "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
michael@0 242 /*r3=D'+D'*/ \
michael@0 243 "paddw %%mm3,%%mm3\n\t" \
michael@0 244 /*r3=R3=E'+D'*/ \
michael@0 245 "paddw %%mm4,%%mm3\n\t" \
michael@0 246 /*r4=NR4*/ \
michael@0 247 "psraw $4,%%mm4\n\t" \
michael@0 248 /*r6=R6=F'-B''*/ \
michael@0 249 "psubw %%mm5,%%mm6\n\t" \
michael@0 250 /*r3=NR3*/ \
michael@0 251 "psraw $4,%%mm3\n\t" \
michael@0 252 "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
michael@0 253 /*r5=B''+B''*/ \
michael@0 254 "paddw %%mm5,%%mm5\n\t" \
michael@0 255 /*r5=R5=F'+B''*/ \
michael@0 256 "paddw %%mm6,%%mm5\n\t" \
michael@0 257 /*r6=NR6*/ \
michael@0 258 "psraw $4,%%mm6\n\t" \
michael@0 259 /*Store NR4 at J(4).*/ \
michael@0 260 "movq %%mm4,"OC_J(4,_y)"\n\t" \
michael@0 261 /*r5=NR5*/ \
michael@0 262 "psraw $4,%%mm5\n\t" \
michael@0 263 /*Store NR3 at I(3).*/ \
michael@0 264 "movq %%mm3,"OC_I(3,_y)"\n\t" \
michael@0 265 /*r7=R7=G'-C'*/ \
michael@0 266 "psubw %%mm0,%%mm7\n\t" \
michael@0 267 "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
michael@0 268 /*r0=C'+C'*/ \
michael@0 269 "paddw %%mm0,%%mm0\n\t" \
michael@0 270 /*r0=R0=G'+C'*/ \
michael@0 271 "paddw %%mm7,%%mm0\n\t" \
michael@0 272 /*r7=NR7*/ \
michael@0 273 "psraw $4,%%mm7\n\t" \
michael@0 274 /*Store NR6 at J(6).*/ \
michael@0 275 "movq %%mm6,"OC_J(6,_y)"\n\t" \
michael@0 276 /*r0=NR0*/ \
michael@0 277 "psraw $4,%%mm0\n\t" \
michael@0 278 /*Store NR5 at J(5).*/ \
michael@0 279 "movq %%mm5,"OC_J(5,_y)"\n\t" \
michael@0 280 /*Store NR7 at J(7).*/ \
michael@0 281 "movq %%mm7,"OC_J(7,_y)"\n\t" \
michael@0 282 /*Store NR0 at I(0).*/ \
michael@0 283 "movq %%mm0,"OC_I(0,_y)"\n\t" \
michael@0 284 "#end OC_COLUMN_IDCT\n\t" \
michael@0 285
/*Full 8x8 inverse DCT using MMX.
  _y: The 64-entry output buffer; it is also used as scratch space between
       the row and column passes.
  _x: The 64-entry input buffer.
      If it is a different buffer than _y, all 64 entries are set to zero
       after the transform.
  The work is done by a single asm statement made of two row passes (each
   followed by a transpose) and two column passes; OC_I and OC_J are redefined
   before each pass to select which part of the block it addresses.*/
michael@0 286 static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
michael@0 287 /*This routine accepts an 8x8 matrix, but in partially transposed form.
michael@0 288 Every 4x4 block is transposed.*/
michael@0 289 __asm__ __volatile__(
/*First row pass: byte offsets 0...63.
  I(0...3) are the first 8 bytes of each 16-byte line, J(4...7) the second 8
   bytes.*/
michael@0 290 #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
michael@0 291 #define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
michael@0 292 OC_ROW_IDCT(y,x)
michael@0 293 OC_TRANSPOSE(y)
michael@0 294 #undef OC_I
michael@0 295 #undef OC_J
/*Second row pass: same layout, shifted by 64 bytes (offsets 64...127).*/
michael@0 296 #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+64,_y)
michael@0 297 #define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+72,_y)
michael@0 298 OC_ROW_IDCT(y,x)
michael@0 299 OC_TRANSPOSE(y)
michael@0 300 #undef OC_I
michael@0 301 #undef OC_J
/*First column pass: the first 8 bytes of each of the eight 16-byte lines.
  OC_J is simply an alias for OC_I here.*/
michael@0 302 #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
michael@0 303 #define OC_J(_k,_y) OC_I(_k,_y)
michael@0 304 OC_COLUMN_IDCT(y)
michael@0 305 #undef OC_I
michael@0 306 #undef OC_J
/*Second column pass: the second 8 bytes of each 16-byte line.*/
michael@0 307 #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
michael@0 308 #define OC_J(_k,_y) OC_I(_k,_y)
michael@0 309 OC_COLUMN_IDCT(y)
michael@0 310 #undef OC_I
michael@0 311 #undef OC_J
michael@0 312 :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
michael@0 313 :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
michael@0 314 [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
michael@0 315 );
/*Clear the input coefficients, unless the transform was done in place.*/
michael@0 316 if(_x!=_y){
michael@0 317 int i;
/*NOTE(review): mm0 is zeroed here in its own asm statement and then used by
   the separate asm statements in the loop below, and no statement in this
   function lists the MMX registers as clobbered.
  This relies on the compiler not touching mm0 in between; confirm that this
   is acceptable for the supported compilers.*/
michael@0 318 __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
/*Each iteration zeroes 32 bytes (16 coefficients); 4 iterations cover all
   64.*/
michael@0 319 for(i=0;i<4;i++){
michael@0 320 __asm__ __volatile__(
michael@0 321 "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
michael@0 322 "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t"
michael@0 323 "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
michael@0 324 "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t"
michael@0 325 :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16)
michael@0 326 );
michael@0 327 }
michael@0 328 }
michael@0 329 }
michael@0 330
/*Reduced version of OC_IDCT_BEGIN used by the "10 coefficient" transform.
  It reads only OC_I(0...3,_x) and never reads any OC_J(4...7,_x) input, which
   is what makes it shorter than the full version.
  It has the same requirements (an operand named c, OC_I defined), spills the
   same two intermediates to OC_I(1,_y) and OC_I(2,_y), and reloads OC_I(1,_y)
   into mm0 at the end, so the tail macros that follow it are instruction for
   instruction the same as in the full transform.
  The purpose of the two "nop" instructions is not documented here;
   presumably they are scheduling padding (unconfirmed).*/
michael@0 331 /*25 cycles.*/
michael@0 332 #define OC_IDCT_BEGIN_10(_y,_x) \
michael@0 333 "#OC_IDCT_BEGIN_10\n\t" \
michael@0 334 "movq "OC_I(3,_x)",%%mm2\n\t" \
michael@0 335 "nop\n\t" \
michael@0 336 "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
michael@0 337 "movq %%mm2,%%mm4\n\t" \
michael@0 338 "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
michael@0 339 "pmulhw %%mm6,%%mm4\n\t" \
michael@0 340 "movq "OC_I(1,_x)",%%mm3\n\t" \
michael@0 341 "pmulhw %%mm2,%%mm1\n\t" \
michael@0 342 "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
michael@0 343 "paddw %%mm2,%%mm4\n\t" \
michael@0 344 "pxor %%mm6,%%mm6\n\t" \
michael@0 345 "paddw %%mm1,%%mm2\n\t" \
michael@0 346 "movq "OC_I(2,_x)",%%mm5\n\t" \
michael@0 347 "pmulhw %%mm3,%%mm0\n\t" \
michael@0 348 "movq %%mm5,%%mm1\n\t" \
michael@0 349 "paddw %%mm3,%%mm0\n\t" \
michael@0 350 "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
michael@0 351 "psubw %%mm2,%%mm6\n\t" \
michael@0 352 "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
michael@0 353 "psubw %%mm4,%%mm0\n\t" \
michael@0 354 "movq "OC_I(2,_x)",%%mm7\n\t" \
michael@0 355 "paddw %%mm4,%%mm4\n\t" \
michael@0 356 "paddw %%mm5,%%mm7\n\t" \
michael@0 357 "paddw %%mm0,%%mm4\n\t" \
michael@0 358 "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
michael@0 359 "psubw %%mm6,%%mm3\n\t" \
michael@0 360 "movq %%mm4,"OC_I(1,_y)"\n\t" \
michael@0 361 "paddw %%mm6,%%mm6\n\t" \
michael@0 362 "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
michael@0 363 "paddw %%mm3,%%mm6\n\t" \
michael@0 364 "movq %%mm3,%%mm5\n\t" \
michael@0 365 "pmulhw %%mm4,%%mm3\n\t" \
michael@0 366 "movq %%mm6,"OC_I(2,_y)"\n\t" \
michael@0 367 "movq %%mm0,%%mm2\n\t" \
michael@0 368 "movq "OC_I(0,_x)",%%mm6\n\t" \
michael@0 369 "pmulhw %%mm4,%%mm0\n\t" \
michael@0 370 "paddw %%mm3,%%mm5\n\t" \
michael@0 371 "paddw %%mm0,%%mm2\n\t" \
michael@0 372 "psubw %%mm1,%%mm5\n\t" \
michael@0 373 "pmulhw %%mm4,%%mm6\n\t" \
michael@0 374 "paddw "OC_I(0,_x)",%%mm6\n\t" \
michael@0 375 "paddw %%mm1,%%mm1\n\t" \
michael@0 376 "movq %%mm6,%%mm4\n\t" \
michael@0 377 "paddw %%mm5,%%mm1\n\t" \
michael@0 378 "psubw %%mm2,%%mm6\n\t" \
michael@0 379 "paddw %%mm2,%%mm2\n\t" \
michael@0 380 "movq "OC_I(1,_y)",%%mm0\n\t" \
michael@0 381 "paddw %%mm6,%%mm2\n\t" \
michael@0 382 "psubw %%mm1,%%mm2\n\t" \
michael@0 383 "nop\n\t" \
michael@0 384 "#end OC_IDCT_BEGIN_10\n\t" \
michael@0 385
/*Row pass of the reduced iDCT: expands OC_IDCT_BEGIN_10 and then performs the
   same final butterflies as OC_ROW_IDCT.
  On exit R1 has been stored to OC_I(1,_y) and the remaining results are left
   in mm0 and mm2...mm7, ready for OC_TRANSPOSE.*/
michael@0 386 /*25+8=33 cycles.*/
michael@0 387 #define OC_ROW_IDCT_10(_y,_x) \
michael@0 388 "#OC_ROW_IDCT_10\n\t" \
michael@0 389 OC_IDCT_BEGIN_10(_y,_x) \
michael@0 390 /*r3=D'*/ \
michael@0 391 "movq "OC_I(2,_y)",%%mm3\n\t" \
michael@0 392 /*r4=E'=E-G*/ \
michael@0 393 "psubw %%mm7,%%mm4\n\t" \
michael@0 394 /*r1=H'+H'*/ \
michael@0 395 "paddw %%mm1,%%mm1\n\t" \
michael@0 396 /*r7=G+G*/ \
michael@0 397 "paddw %%mm7,%%mm7\n\t" \
michael@0 398 /*r1=R1=A''+H'*/ \
michael@0 399 "paddw %%mm2,%%mm1\n\t" \
michael@0 400 /*r7=G'=E+G*/ \
michael@0 401 "paddw %%mm4,%%mm7\n\t" \
michael@0 402 /*r4=R4=E'-D'*/ \
michael@0 403 "psubw %%mm3,%%mm4\n\t" \
michael@0 404 "paddw %%mm3,%%mm3\n\t" \
michael@0 405 /*r6=R6=F'-B''*/ \
michael@0 406 "psubw %%mm5,%%mm6\n\t" \
michael@0 407 "paddw %%mm5,%%mm5\n\t" \
michael@0 408 /*r3=R3=E'+D'*/ \
michael@0 409 "paddw %%mm4,%%mm3\n\t" \
michael@0 410 /*r5=R5=F'+B''*/ \
michael@0 411 "paddw %%mm6,%%mm5\n\t" \
michael@0 412 /*r7=R7=G'-C'*/ \
michael@0 413 "psubw %%mm0,%%mm7\n\t" \
michael@0 414 "paddw %%mm0,%%mm0\n\t" \
michael@0 415 /*Save R1.*/ \
michael@0 416 "movq %%mm1,"OC_I(1,_y)"\n\t" \
michael@0 417 /*r0=R0=G'+C'*/ \
michael@0 418 "paddw %%mm7,%%mm0\n\t" \
michael@0 419 "#end OC_ROW_IDCT_10\n\t" \
michael@0 420
/*Column pass of the reduced iDCT, operating in place on _y.
  It expands OC_IDCT_BEGIN_10 and then normalizes and stores the results
   exactly as OC_COLUMN_IDCT does: the constant at byte offset 0x00 of c is
   added and each sum is arithmetically shifted right by 4 before being
   stored.
  NR0...NR3 are stored through OC_I and NR4...NR7 through OC_J.*/
michael@0 421 /*25+19=44 cycles.*/
michael@0 422 #define OC_COLUMN_IDCT_10(_y) \
michael@0 423 "#OC_COLUMN_IDCT_10\n\t" \
michael@0 424 OC_IDCT_BEGIN_10(_y,_y) \
michael@0 425 "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
michael@0 426 /*r1=H'+H'*/ \
michael@0 427 "paddw %%mm1,%%mm1\n\t" \
michael@0 428 /*r1=R1=A''+H'*/ \
michael@0 429 "paddw %%mm2,%%mm1\n\t" \
michael@0 430 /*r2=NR2*/ \
michael@0 431 "psraw $4,%%mm2\n\t" \
michael@0 432 /*r4=E'=E-G*/ \
michael@0 433 "psubw %%mm7,%%mm4\n\t" \
michael@0 434 /*r1=NR1*/ \
michael@0 435 "psraw $4,%%mm1\n\t" \
michael@0 436 /*r3=D'*/ \
michael@0 437 "movq "OC_I(2,_y)",%%mm3\n\t" \
michael@0 438 /*r7=G+G*/ \
michael@0 439 "paddw %%mm7,%%mm7\n\t" \
michael@0 440 /*Store NR2 at I(2).*/ \
michael@0 441 "movq %%mm2,"OC_I(2,_y)"\n\t" \
michael@0 442 /*r7=G'=E+G*/ \
michael@0 443 "paddw %%mm4,%%mm7\n\t" \
michael@0 444 /*Store NR1 at I(1).*/ \
michael@0 445 "movq %%mm1,"OC_I(1,_y)"\n\t" \
michael@0 446 /*r4=R4=E'-D'*/ \
michael@0 447 "psubw %%mm3,%%mm4\n\t" \
michael@0 448 "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
michael@0 449 /*r3=D'+D'*/ \
michael@0 450 "paddw %%mm3,%%mm3\n\t" \
michael@0 451 /*r3=R3=E'+D'*/ \
michael@0 452 "paddw %%mm4,%%mm3\n\t" \
michael@0 453 /*r4=NR4*/ \
michael@0 454 "psraw $4,%%mm4\n\t" \
michael@0 455 /*r6=R6=F'-B''*/ \
michael@0 456 "psubw %%mm5,%%mm6\n\t" \
michael@0 457 /*r3=NR3*/ \
michael@0 458 "psraw $4,%%mm3\n\t" \
michael@0 459 "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
michael@0 460 /*r5=B''+B''*/ \
michael@0 461 "paddw %%mm5,%%mm5\n\t" \
michael@0 462 /*r5=R5=F'+B''*/ \
michael@0 463 "paddw %%mm6,%%mm5\n\t" \
michael@0 464 /*r6=NR6*/ \
michael@0 465 "psraw $4,%%mm6\n\t" \
michael@0 466 /*Store NR4 at J(4).*/ \
michael@0 467 "movq %%mm4,"OC_J(4,_y)"\n\t" \
michael@0 468 /*r5=NR5*/ \
michael@0 469 "psraw $4,%%mm5\n\t" \
michael@0 470 /*Store NR3 at I(3).*/ \
michael@0 471 "movq %%mm3,"OC_I(3,_y)"\n\t" \
michael@0 472 /*r7=R7=G'-C'*/ \
michael@0 473 "psubw %%mm0,%%mm7\n\t" \
michael@0 474 "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
michael@0 475 /*r0=C'+C'*/ \
michael@0 476 "paddw %%mm0,%%mm0\n\t" \
michael@0 477 /*r0=R0=G'+C'*/ \
michael@0 478 "paddw %%mm7,%%mm0\n\t" \
michael@0 479 /*r7=NR7*/ \
michael@0 480 "psraw $4,%%mm7\n\t" \
michael@0 481 /*Store NR6 at J(6).*/ \
michael@0 482 "movq %%mm6,"OC_J(6,_y)"\n\t" \
michael@0 483 /*r0=NR0*/ \
michael@0 484 "psraw $4,%%mm0\n\t" \
michael@0 485 /*Store NR5 at J(5).*/ \
michael@0 486 "movq %%mm5,"OC_J(5,_y)"\n\t" \
michael@0 487 /*Store NR7 at J(7).*/ \
michael@0 488 "movq %%mm7,"OC_J(7,_y)"\n\t" \
michael@0 489 /*Store NR0 at I(0).*/ \
michael@0 490 "movq %%mm0,"OC_I(0,_y)"\n\t" \
michael@0 491 "#end OC_COLUMN_IDCT_10\n\t" \
michael@0 492
/*Reduced 8x8 inverse DCT using MMX, selected by oc_idct8x8_mmx() when
   _last_zzi<=10.
  _y: The 64-entry output buffer; it is also used as scratch space between
       the row and column passes.
  _x: The 64-entry input buffer.
      Only the first 8 bytes (4 entries) at byte offsets 0x00, 0x10, 0x20 and
       0x30 are read.
      If it is a different buffer than _y, exactly those 16 entries are set to
       zero after the transform.
  Compared to oc_idct8x8_slow_mmx() there is a single row pass instead of two,
   and both the row and column passes use the shorter *_10 macros.*/
michael@0 493 static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){
michael@0 494 __asm__ __volatile__(
michael@0 495 #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
michael@0 496 #define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y)
michael@0 497 /*Done with dequant, descramble, and partial transpose.
michael@0 498 Now do the iDCT itself.*/
michael@0 499 OC_ROW_IDCT_10(y,x)
michael@0 500 OC_TRANSPOSE(y)
michael@0 501 #undef OC_I
michael@0 502 #undef OC_J
/*First column pass: the first 8 bytes of each 16-byte line.
  OC_J is simply an alias for OC_I here.*/
michael@0 503 #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y)
michael@0 504 #define OC_J(_k,_y) OC_I(_k,_y)
michael@0 505 OC_COLUMN_IDCT_10(y)
michael@0 506 #undef OC_I
michael@0 507 #undef OC_J
/*Second column pass: the second 8 bytes of each 16-byte line.*/
michael@0 508 #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y)
michael@0 509 #define OC_J(_k,_y) OC_I(_k,_y)
michael@0 510 OC_COLUMN_IDCT_10(y)
michael@0 511 #undef OC_I
michael@0 512 #undef OC_J
michael@0 513 :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64)
michael@0 514 :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64),
michael@0 515 [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)
michael@0 516 );
/*Clear the input coefficients this routine read, unless the transform was
   done in place.
  The last store ends at byte 0x38, i.e. after 28 entries, which is the
   length declared for the memory operand.*/
michael@0 517 if(_x!=_y){
michael@0 518 __asm__ __volatile__(
michael@0 519 "pxor %%mm0,%%mm0\n\t"
michael@0 520 "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t"
michael@0 521 "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t"
michael@0 522 "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t"
michael@0 523 "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t"
michael@0 524 :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28)
michael@0 525 );
michael@0 526 }
michael@0 527 }
michael@0 528
michael@0 529 /*Performs an inverse 8x8 Type-II DCT transform.
michael@0 530 The input is assumed to be scaled by a factor of 4 relative to the orthonormal
michael@0 531 version of the transform.*/
/*_y:        The 64-entry buffer that receives the result.
  _x:        The 64-entry buffer of input coefficients.
             If it is a different buffer than _y, the routine that does the
              work zeroes the input entries it covers (all 64 for the full
              transform, 16 for the reduced one).
  _last_zzi: Selects the transform: values of 10 or less use the reduced
              transform, anything larger uses the full one.
             See the comment in the function body for what this value means.*/
michael@0 532 void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
michael@0 533 /*_last_zzi is subtly different from an actual count of the number of
michael@0 534 coefficients we decoded for this block.
michael@0 535 It contains the value of zzi BEFORE the final token in the block was
michael@0 536 decoded.
michael@0 537 In most cases this is an EOB token (the continuation of an EOB run from a
michael@0 538 previous block counts), and so this is the same as the coefficient count.
michael@0 539 However, in the case that the last token was NOT an EOB token, but filled
michael@0 540 the block up with exactly 64 coefficients, _last_zzi will be less than 64.
michael@0 541 Provided the last token was not a pure zero run, the minimum value it can
michael@0 542 be is 46, and so that doesn't affect any of the cases in this routine.
michael@0 543 However, if the last token WAS a pure zero run of length 63, then _last_zzi
michael@0 544 will be 1 while the number of coefficients decoded is 64.
michael@0 545 Thus, we will trigger the following special case, where the real
michael@0 546 coefficient count would not.
michael@0 547 Note also that a zero run of length 64 will give _last_zzi a value of 0,
michael@0 548 but we still process the DC coefficient, which might have a non-zero value
michael@0 549 due to DC prediction.
michael@0 550 Although convoluted, this is arguably the correct behavior: it allows us to
michael@0 551 use a smaller transform when the block ends with a long zero run instead
michael@0 552 of a normal EOB token.
michael@0 553 It could be smarter... multiple separate zero runs at the end of a block
michael@0 554 will fool it, but an encoder that generates these really deserves what it
michael@0 555 gets.
michael@0 556 Needless to say we inherited this approach from VP3.*/
michael@0 557 /*Then perform the iDCT.*/
michael@0 558 if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x);
michael@0 559 else oc_idct8x8_slow_mmx(_y,_x);
michael@0 560 }
michael@0 561
michael@0 562 #endif

mercurial