media/libtheora/lib/x86_vc/mmxidct.c

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

michael@0 1 /********************************************************************
michael@0 2 * *
michael@0 3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. *
michael@0 4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS *
michael@0 5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
michael@0 6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. *
michael@0 7 * *
michael@0 8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 *
michael@0 9 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
michael@0 10 * *
michael@0 11 ********************************************************************
michael@0 12
michael@0 13 function:
michael@0 14 last mod: $Id: mmxidct.c 17446 2010-09-23 20:06:20Z tterribe $
michael@0 15
michael@0 16 ********************************************************************/
michael@0 17
michael@0 18 /*MMX acceleration of Theora's iDCT.
michael@0 19 Originally written by Rudolf Marek, based on code from On2's VP3.*/
michael@0 20 #include "x86int.h"
michael@0 21 #include "../dct.h"
michael@0 22
michael@0 23 #if defined(OC_X86_ASM)
michael@0 24
michael@0 25 /*These are byte offsets into the table of constants below
 (OC_IDCT_CONSTS); OC_MID adds them directly to the table's address.*/
michael@0 26 /*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
michael@0 27 #define OC_COSINE_OFFSET (8)
michael@0 28 /*A row of 8's.*/
michael@0 29 #define OC_EIGHT_OFFSET (0)
michael@0 30
michael@0 31
michael@0 32
michael@0 33 /*A table of constants used by the MMX routines.
 It is laid out as (1+7) rows of four ogg_uint16_t values each: row 0 holds
  four 8's, and rows 1...7 each hold four copies of one cosine constant
  (OC_C1S7 ... OC_C7S1).
 The macros below use a whole row at a time as one MMX memory operand (see
  OC_C and OC_8).*/
michael@0 34 static const OC_ALIGN16(ogg_uint16_t) OC_IDCT_CONSTS[(1+7)*4]={
michael@0 35 8, 8, 8, 8,
michael@0 36 (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
michael@0 37 (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7,
michael@0 38 (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
michael@0 39 (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6,
michael@0 40 (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
michael@0 41 (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5,
michael@0 42 (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
michael@0 43 (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4,
michael@0 44 (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
michael@0 45 (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3,
michael@0 46 (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
michael@0 47 (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2,
michael@0 48 (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1,
michael@0 49 (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1
michael@0 50 };
michael@0 51
michael@0 52 /*38 cycles*/
/*Common first stage expanded by both OC_ROW_IDCT and OC_COLUMN_IDCT.
 It loads the eight inputs OC_I(0...3,_x) and OC_J(4...7,_x), combines them
  with the cosine rows OC_C(1...7), and leaves the intermediate terms in the
  MMX registers for the finishing code of the expanding macro.
 OC_I(1,_y) and OC_I(2,_y) are used as scratch storage: mm0 is reloaded from
  OC_I(1,_y) by the last-but-three instruction, while the value parked in
  OC_I(2,_y) is left for the expanding macro to fetch.
 OC_I, OC_J and OC_C must be defined at the point of expansion.*/
michael@0 53 #define OC_IDCT_BEGIN(_y,_x) __asm{ \
michael@0 54 __asm movq mm2,OC_I(3,_x) \
michael@0 55 __asm movq mm6,OC_C(3) \
michael@0 56 __asm movq mm4,mm2 \
michael@0 57 __asm movq mm7,OC_J(5,_x) \
michael@0 58 __asm pmulhw mm4,mm6 \
michael@0 59 __asm movq mm1,OC_C(5) \
michael@0 60 __asm pmulhw mm6,mm7 \
michael@0 61 __asm movq mm5,mm1 \
michael@0 62 __asm pmulhw mm1,mm2 \
michael@0 63 __asm movq mm3,OC_I(1,_x) \
michael@0 64 __asm pmulhw mm5,mm7 \
michael@0 65 __asm movq mm0,OC_C(1) \
michael@0 66 __asm paddw mm4,mm2 \
michael@0 67 __asm paddw mm6,mm7 \
michael@0 68 __asm paddw mm2,mm1 \
michael@0 69 __asm movq mm1,OC_J(7,_x) \
michael@0 70 __asm paddw mm7,mm5 \
michael@0 71 __asm movq mm5,mm0 \
michael@0 72 __asm pmulhw mm0,mm3 \
michael@0 73 __asm paddw mm4,mm7 \
michael@0 74 __asm pmulhw mm5,mm1 \
michael@0 75 __asm movq mm7,OC_C(7) \
michael@0 76 __asm psubw mm6,mm2 \
michael@0 77 __asm paddw mm0,mm3 \
michael@0 78 __asm pmulhw mm3,mm7 \
michael@0 79 __asm movq mm2,OC_I(2,_x) \
michael@0 80 __asm pmulhw mm7,mm1 \
michael@0 81 __asm paddw mm5,mm1 \
michael@0 82 __asm movq mm1,mm2 \
michael@0 83 __asm pmulhw mm2,OC_C(2) \
michael@0 84 __asm psubw mm3,mm5 \
michael@0 85 __asm movq mm5,OC_J(6,_x) \
michael@0 86 __asm paddw mm0,mm7 \
michael@0 87 __asm movq mm7,mm5 \
michael@0 88 __asm psubw mm0,mm4 \
michael@0 89 __asm pmulhw mm5,OC_C(2) \
michael@0 90 __asm paddw mm2,mm1 \
michael@0 91 __asm pmulhw mm1,OC_C(6) \
michael@0 92 __asm paddw mm4,mm4 \
michael@0 93 __asm paddw mm4,mm0 \
michael@0 94 __asm psubw mm3,mm6 \
michael@0 95 __asm paddw mm5,mm7 \
michael@0 96 __asm paddw mm6,mm6 \
michael@0 97 __asm pmulhw mm7,OC_C(6) \
michael@0 98 __asm paddw mm6,mm3 \
michael@0 99 __asm movq OC_I(1,_y),mm4 \
michael@0 100 __asm psubw mm1,mm5 \
michael@0 101 __asm movq mm4,OC_C(4) \
michael@0 102 __asm movq mm5,mm3 \
michael@0 103 __asm pmulhw mm3,mm4 \
michael@0 104 __asm paddw mm7,mm2 \
michael@0 105 __asm movq OC_I(2,_y),mm6 \
michael@0 106 __asm movq mm2,mm0 \
michael@0 107 __asm movq mm6,OC_I(0,_x) \
michael@0 108 __asm pmulhw mm0,mm4 \
michael@0 109 __asm paddw mm5,mm3 \
michael@0 110 __asm movq mm3,OC_J(4,_x) \
michael@0 111 __asm psubw mm5,mm1 \
michael@0 112 __asm paddw mm2,mm0 \
michael@0 113 __asm psubw mm6,mm3 \
michael@0 114 __asm movq mm0,mm6 \
michael@0 115 __asm pmulhw mm6,mm4 \
michael@0 116 __asm paddw mm3,mm3 \
michael@0 117 __asm paddw mm1,mm1 \
michael@0 118 __asm paddw mm3,mm0 \
michael@0 119 __asm paddw mm1,mm5 \
michael@0 120 __asm pmulhw mm4,mm3 \
michael@0 121 __asm paddw mm6,mm0 \
michael@0 122 __asm psubw mm6,mm2 \
michael@0 123 __asm paddw mm2,mm2 \
michael@0 124 __asm movq mm0,OC_I(1,_y) \
michael@0 125 __asm paddw mm2,mm6 \
michael@0 126 __asm paddw mm4,mm3 \
michael@0 127 __asm psubw mm2,mm1 \
michael@0 128 }
michael@0 129
michael@0 130 /*38+8=46 cycles.*/
/*Expands OC_IDCT_BEGIN and then finishes the 1-D iDCT with the final
  butterflies, with no rounding or shifting of the results.
 R1 is written back to OC_I(1,_y); going by the register comments below, the
  other results stay in registers (R0 in mm0, R3...R7 in mm3...mm7, and mm2
  left as produced by OC_IDCT_BEGIN), which is the layout OC_TRANSPOSE
  documents as its entry condition.*/
michael@0 131 #define OC_ROW_IDCT(_y,_x) __asm{ \
michael@0 132 OC_IDCT_BEGIN(_y,_x) \
michael@0 133 /*r3=D'*/ \
michael@0 134 __asm movq mm3,OC_I(2,_y) \
michael@0 135 /*r4=E'=E-G*/ \
michael@0 136 __asm psubw mm4,mm7 \
michael@0 137 /*r1=H'+H'*/ \
michael@0 138 __asm paddw mm1,mm1 \
michael@0 139 /*r7=G+G*/ \
michael@0 140 __asm paddw mm7,mm7 \
michael@0 141 /*r1=R1=A''+H'*/ \
michael@0 142 __asm paddw mm1,mm2 \
michael@0 143 /*r7=G'=E+G*/ \
michael@0 144 __asm paddw mm7,mm4 \
michael@0 145 /*r4=R4=E'-D'*/ \
michael@0 146 __asm psubw mm4,mm3 \
michael@0 147 __asm paddw mm3,mm3 \
michael@0 148 /*r6=R6=F'-B''*/ \
michael@0 149 __asm psubw mm6,mm5 \
michael@0 150 __asm paddw mm5,mm5 \
michael@0 151 /*r3=R3=E'+D'*/ \
michael@0 152 __asm paddw mm3,mm4 \
michael@0 153 /*r5=R5=F'+B''*/ \
michael@0 154 __asm paddw mm5,mm6 \
michael@0 155 /*r7=R7=G'-C'*/ \
michael@0 156 __asm psubw mm7,mm0 \
michael@0 157 __asm paddw mm0,mm0 \
michael@0 158 /*Save R1.*/ \
michael@0 159 __asm movq OC_I(1,_y),mm1 \
michael@0 160 /*r0=R0=G'+C'*/ \
michael@0 161 __asm paddw mm0,mm7 \
michael@0 162 }
michael@0 163
michael@0 164 /*The following macro does two 4x4 transposes in place.
michael@0 165 At entry, we assume:
michael@0 166 r0 = a3 a2 a1 a0
michael@0 167 I(1) = b3 b2 b1 b0
michael@0 168 r2 = c3 c2 c1 c0
michael@0 169 r3 = d3 d2 d1 d0
michael@0 170
michael@0 171 r4 = e3 e2 e1 e0
michael@0 172 r5 = f3 f2 f1 f0
michael@0 173 r6 = g3 g2 g1 g0
michael@0 174 r7 = h3 h2 h1 h0
michael@0 175
michael@0 176 At exit, we have:
michael@0 177 I(0) = d0 c0 b0 a0
michael@0 178 I(1) = d1 c1 b1 a1
michael@0 179 I(2) = d2 c2 b2 a2
michael@0 180 I(3) = d3 c3 b3 a3
michael@0 181
michael@0 182 J(4) = h0 g0 f0 e0
michael@0 183 J(5) = h1 g1 f1 e1
michael@0 184 J(6) = h2 g2 f2 e2
michael@0 185 J(7) = h3 g3 f3 e3
michael@0 186
michael@0 187 I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
michael@0 188 J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
michael@0 189
michael@0 190 Since r1 is free at entry, we calculate the Js first.*/
michael@0 191 /*19 cycles.*/
/*Register and memory usage: mm0...mm2 and mm4...mm6 are overwritten, while
  mm3 and mm7 are only ever read.
 OC_I(0,_y) doubles as a spill slot: r0 is parked there by the third
  instruction so that mm0 can be used as a temporary for the J half, and it is
  reloaded (into mm4) before the I half is transposed.
 The four J rows are stored before any of the final I rows.*/
michael@0 192 #define OC_TRANSPOSE(_y) __asm{ \
michael@0 193 __asm movq mm1,mm4 \
michael@0 194 __asm punpcklwd mm4,mm5 \
michael@0 195 __asm movq OC_I(0,_y),mm0 \
michael@0 196 __asm punpckhwd mm1,mm5 \
michael@0 197 __asm movq mm0,mm6 \
michael@0 198 __asm punpcklwd mm6,mm7 \
michael@0 199 __asm movq mm5,mm4 \
michael@0 200 __asm punpckldq mm4,mm6 \
michael@0 201 __asm punpckhdq mm5,mm6 \
michael@0 202 __asm movq mm6,mm1 \
michael@0 203 __asm movq OC_J(4,_y),mm4 \
michael@0 204 __asm punpckhwd mm0,mm7 \
michael@0 205 __asm movq OC_J(5,_y),mm5 \
michael@0 206 __asm punpckhdq mm6,mm0 \
michael@0 207 __asm movq mm4,OC_I(0,_y) \
michael@0 208 __asm punpckldq mm1,mm0 \
michael@0 209 __asm movq mm5,OC_I(1,_y) \
michael@0 210 __asm movq mm0,mm4 \
michael@0 211 __asm movq OC_J(7,_y),mm6 \
michael@0 212 __asm punpcklwd mm0,mm5 \
michael@0 213 __asm movq OC_J(6,_y),mm1 \
michael@0 214 __asm punpckhwd mm4,mm5 \
michael@0 215 __asm movq mm5,mm2 \
michael@0 216 __asm punpcklwd mm2,mm3 \
michael@0 217 __asm movq mm1,mm0 \
michael@0 218 __asm punpckldq mm0,mm2 \
michael@0 219 __asm punpckhdq mm1,mm2 \
michael@0 220 __asm movq mm2,mm4 \
michael@0 221 __asm movq OC_I(0,_y),mm0 \
michael@0 222 __asm punpckhwd mm5,mm3 \
michael@0 223 __asm movq OC_I(1,_y),mm1 \
michael@0 224 __asm punpckhdq mm4,mm5 \
michael@0 225 __asm punpckldq mm2,mm5 \
michael@0 226 __asm movq OC_I(3,_y),mm4 \
michael@0 227 __asm movq OC_I(2,_y),mm2 \
michael@0 228 }
michael@0 229
michael@0 230 /*38+19=57 cycles.*/
/*Expands OC_IDCT_BEGIN with _y as both source and destination (the pass is
  done in place), then finishes the 1-D iDCT and stores all eight results.
 The row of 8's (OC_8) is added to one term of each output pair, and carried
  into the other term by the paddw that follows, before the arithmetic right
  shifts by 4 (psraw); the comments below call the shifted values NR0...NR7.
 NR0...NR3 are stored to OC_I(0...3,_y) and NR4...NR7 to OC_J(4...7,_y).
 OC_I, OC_J, OC_C and OC_8 must be defined at the point of expansion.*/
michael@0 231 #define OC_COLUMN_IDCT(_y) __asm{ \
michael@0 232 OC_IDCT_BEGIN(_y,_y) \
michael@0 233 __asm paddw mm2,OC_8 \
michael@0 234 /*r1=H'+H'*/ \
michael@0 235 __asm paddw mm1,mm1 \
michael@0 236 /*r1=R1=A''+H'*/ \
michael@0 237 __asm paddw mm1,mm2 \
michael@0 238 /*r2=NR2*/ \
michael@0 239 __asm psraw mm2,4 \
michael@0 240 /*r4=E'=E-G*/ \
michael@0 241 __asm psubw mm4,mm7 \
michael@0 242 /*r1=NR1*/ \
michael@0 243 __asm psraw mm1,4 \
michael@0 244 /*r3=D'*/ \
michael@0 245 __asm movq mm3,OC_I(2,_y) \
michael@0 246 /*r7=G+G*/ \
michael@0 247 __asm paddw mm7,mm7 \
michael@0 248 /*Store NR2 at I(2).*/ \
michael@0 249 __asm movq OC_I(2,_y),mm2 \
michael@0 250 /*r7=G'=E+G*/ \
michael@0 251 __asm paddw mm7,mm4 \
michael@0 252 /*Store NR1 at I(1).*/ \
michael@0 253 __asm movq OC_I(1,_y),mm1 \
michael@0 254 /*r4=R4=E'-D'*/ \
michael@0 255 __asm psubw mm4,mm3 \
michael@0 256 __asm paddw mm4,OC_8 \
michael@0 257 /*r3=D'+D'*/ \
michael@0 258 __asm paddw mm3,mm3 \
michael@0 259 /*r3=R3=E'+D'*/ \
michael@0 260 __asm paddw mm3,mm4 \
michael@0 261 /*r4=NR4*/ \
michael@0 262 __asm psraw mm4,4 \
michael@0 263 /*r6=R6=F'-B''*/ \
michael@0 264 __asm psubw mm6,mm5 \
michael@0 265 /*r3=NR3*/ \
michael@0 266 __asm psraw mm3,4 \
michael@0 267 __asm paddw mm6,OC_8 \
michael@0 268 /*r5=B''+B''*/ \
michael@0 269 __asm paddw mm5,mm5 \
michael@0 270 /*r5=R5=F'+B''*/ \
michael@0 271 __asm paddw mm5,mm6 \
michael@0 272 /*r6=NR6*/ \
michael@0 273 __asm psraw mm6,4 \
michael@0 274 /*Store NR4 at J(4).*/ \
michael@0 275 __asm movq OC_J(4,_y),mm4 \
michael@0 276 /*r5=NR5*/ \
michael@0 277 __asm psraw mm5,4 \
michael@0 278 /*Store NR3 at I(3).*/ \
michael@0 279 __asm movq OC_I(3,_y),mm3 \
michael@0 280 /*r7=R7=G'-C'*/ \
michael@0 281 __asm psubw mm7,mm0 \
michael@0 282 __asm paddw mm7,OC_8 \
michael@0 283 /*r0=C'+C'*/ \
michael@0 284 __asm paddw mm0,mm0 \
michael@0 285 /*r0=R0=G'+C'*/ \
michael@0 286 __asm paddw mm0,mm7 \
michael@0 287 /*r7=NR7*/ \
michael@0 288 __asm psraw mm7,4 \
michael@0 289 /*Store NR6 at J(6).*/ \
michael@0 290 __asm movq OC_J(6,_y),mm6 \
michael@0 291 /*r0=NR0*/ \
michael@0 292 __asm psraw mm0,4 \
michael@0 293 /*Store NR5 at J(5).*/ \
michael@0 294 __asm movq OC_J(5,_y),mm5 \
michael@0 295 /*Store NR7 at J(7).*/ \
michael@0 296 __asm movq OC_J(7,_y),mm7 \
michael@0 297 /*Store NR0 at I(0).*/ \
michael@0 298 __asm movq OC_I(0,_y),mm0 \
michael@0 299 }
michael@0 300
/*Memory operands addressing rows of OC_IDCT_CONSTS.
 OC_MID(_m,_i) addresses the _i'th 8-byte row past byte offset _m from the
  register that CONSTS names at the point of expansion.
 OC_C(_i) is the cosine row for _i in 1...7, and OC_8 is the row of 8's.*/
michael@0 301 #define OC_MID(_m,_i) [CONSTS+_m+(_i)*8]
michael@0 302 #define OC_C(_i) OC_MID(OC_COSINE_OFFSET,_i-1)
michael@0 303 #define OC_8 OC_MID(OC_EIGHT_OFFSET,0)
michael@0 304
michael@0 305 static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){
michael@0 306 int i;
michael@0 307 /*This routine accepts an 8x8 matrix, but in partially transposed form.
michael@0 308 Every 4x4 block is transposed.*/
michael@0 309 __asm{
michael@0 310 #define CONSTS eax
michael@0 311 #define Y edx
michael@0 312 #define X ecx
michael@0 313 mov CONSTS,offset OC_IDCT_CONSTS
michael@0 314 mov Y,_y
michael@0 315 mov X,_x
michael@0 316 #define OC_I(_k,_y) [(_y)+(_k)*16]
michael@0 317 #define OC_J(_k,_y) [(_y)+((_k)-4)*16+8]
michael@0 318 OC_ROW_IDCT(Y,X)
michael@0 319 OC_TRANSPOSE(Y)
michael@0 320 #undef OC_I
michael@0 321 #undef OC_J
michael@0 322 #define OC_I(_k,_y) [(_y)+(_k)*16+64]
michael@0 323 #define OC_J(_k,_y) [(_y)+((_k)-4)*16+72]
michael@0 324 OC_ROW_IDCT(Y,X)
michael@0 325 OC_TRANSPOSE(Y)
michael@0 326 #undef OC_I
michael@0 327 #undef OC_J
michael@0 328 #define OC_I(_k,_y) [(_y)+(_k)*16]
michael@0 329 #define OC_J(_k,_y) OC_I(_k,_y)
michael@0 330 OC_COLUMN_IDCT(Y)
michael@0 331 #undef OC_I
michael@0 332 #undef OC_J
michael@0 333 #define OC_I(_k,_y) [(_y)+(_k)*16+8]
michael@0 334 #define OC_J(_k,_y) OC_I(_k,_y)
michael@0 335 OC_COLUMN_IDCT(Y)
michael@0 336 #undef OC_I
michael@0 337 #undef OC_J
michael@0 338 #undef CONSTS
michael@0 339 #undef Y
michael@0 340 #undef X
michael@0 341 }
michael@0 342 if(_x!=_y){
michael@0 343 int i;
michael@0 344 __asm pxor mm0,mm0;
michael@0 345 for(i=0;i<4;i++){
michael@0 346 ogg_int16_t *x;
michael@0 347 x=_x+16*i;
michael@0 348 #define X ecx
michael@0 349 __asm{
michael@0 350 mov X,x
michael@0 351 movq [X+0x00],mm0
michael@0 352 movq [X+0x08],mm0
michael@0 353 movq [X+0x10],mm0
michael@0 354 movq [X+0x18],mm0
michael@0 355 }
michael@0 356 #undef X
michael@0 357 }
michael@0 358 }
michael@0 359 }
michael@0 360
michael@0 361 /*25 cycles.*/
/*Reduced variant of OC_IDCT_BEGIN, expanded by OC_ROW_IDCT_10 and
  OC_COLUMN_IDCT_10.
 It only reads OC_I(0...3,_x) and never references OC_J, so inputs 4...7 do
  not take part in the computation (presumably because they are known to be
  zero for the blocks this path is used on - confirm against the caller).
 As in OC_IDCT_BEGIN, OC_I(1,_y) and OC_I(2,_y) are used as scratch storage,
  and mm0 is reloaded from OC_I(1,_y) near the end.
 NOTE(review): the two nop instructions look like scheduling/alignment
  padding; nothing here establishes their purpose.*/
michael@0 362 #define OC_IDCT_BEGIN_10(_y,_x) __asm{ \
michael@0 363 __asm movq mm2,OC_I(3,_x) \
michael@0 364 __asm nop \
michael@0 365 __asm movq mm6,OC_C(3) \
michael@0 366 __asm movq mm4,mm2 \
michael@0 367 __asm movq mm1,OC_C(5) \
michael@0 368 __asm pmulhw mm4,mm6 \
michael@0 369 __asm movq mm3,OC_I(1,_x) \
michael@0 370 __asm pmulhw mm1,mm2 \
michael@0 371 __asm movq mm0,OC_C(1) \
michael@0 372 __asm paddw mm4,mm2 \
michael@0 373 __asm pxor mm6,mm6 \
michael@0 374 __asm paddw mm2,mm1 \
michael@0 375 __asm movq mm5,OC_I(2,_x) \
michael@0 376 __asm pmulhw mm0,mm3 \
michael@0 377 __asm movq mm1,mm5 \
michael@0 378 __asm paddw mm0,mm3 \
michael@0 379 __asm pmulhw mm3,OC_C(7) \
michael@0 380 __asm psubw mm6,mm2 \
michael@0 381 __asm pmulhw mm5,OC_C(2) \
michael@0 382 __asm psubw mm0,mm4 \
michael@0 383 __asm movq mm7,OC_I(2,_x) \
michael@0 384 __asm paddw mm4,mm4 \
michael@0 385 __asm paddw mm7,mm5 \
michael@0 386 __asm paddw mm4,mm0 \
michael@0 387 __asm pmulhw mm1,OC_C(6) \
michael@0 388 __asm psubw mm3,mm6 \
michael@0 389 __asm movq OC_I(1,_y),mm4 \
michael@0 390 __asm paddw mm6,mm6 \
michael@0 391 __asm movq mm4,OC_C(4) \
michael@0 392 __asm paddw mm6,mm3 \
michael@0 393 __asm movq mm5,mm3 \
michael@0 394 __asm pmulhw mm3,mm4 \
michael@0 395 __asm movq OC_I(2,_y),mm6 \
michael@0 396 __asm movq mm2,mm0 \
michael@0 397 __asm movq mm6,OC_I(0,_x) \
michael@0 398 __asm pmulhw mm0,mm4 \
michael@0 399 __asm paddw mm5,mm3 \
michael@0 400 __asm paddw mm2,mm0 \
michael@0 401 __asm psubw mm5,mm1 \
michael@0 402 __asm pmulhw mm6,mm4 \
michael@0 403 __asm paddw mm6,OC_I(0,_x) \
michael@0 404 __asm paddw mm1,mm1 \
michael@0 405 __asm movq mm4,mm6 \
michael@0 406 __asm paddw mm1,mm5 \
michael@0 407 __asm psubw mm6,mm2 \
michael@0 408 __asm paddw mm2,mm2 \
michael@0 409 __asm movq mm0,OC_I(1,_y) \
michael@0 410 __asm paddw mm2,mm6 \
michael@0 411 __asm psubw mm2,mm1 \
michael@0 412 __asm nop \
michael@0 413 }
michael@0 414
michael@0 415 /*25+8=33 cycles.*/
/*Expands OC_IDCT_BEGIN_10 and then applies the same finishing sequence as
  OC_ROW_IDCT: the final butterflies, with no rounding or shifting.
 R1 is written back to OC_I(1,_y); the remaining results are left in the MMX
  registers for OC_TRANSPOSE.*/
michael@0 416 #define OC_ROW_IDCT_10(_y,_x) __asm{ \
michael@0 417 OC_IDCT_BEGIN_10(_y,_x) \
michael@0 418 /*r3=D'*/ \
michael@0 419 __asm movq mm3,OC_I(2,_y) \
michael@0 420 /*r4=E'=E-G*/ \
michael@0 421 __asm psubw mm4,mm7 \
michael@0 422 /*r1=H'+H'*/ \
michael@0 423 __asm paddw mm1,mm1 \
michael@0 424 /*r7=G+G*/ \
michael@0 425 __asm paddw mm7,mm7 \
michael@0 426 /*r1=R1=A''+H'*/ \
michael@0 427 __asm paddw mm1,mm2 \
michael@0 428 /*r7=G'=E+G*/ \
michael@0 429 __asm paddw mm7,mm4 \
michael@0 430 /*r4=R4=E'-D'*/ \
michael@0 431 __asm psubw mm4,mm3 \
michael@0 432 __asm paddw mm3,mm3 \
michael@0 433 /*r6=R6=F'-B''*/ \
michael@0 434 __asm psubw mm6,mm5 \
michael@0 435 __asm paddw mm5,mm5 \
michael@0 436 /*r3=R3=E'+D'*/ \
michael@0 437 __asm paddw mm3,mm4 \
michael@0 438 /*r5=R5=F'+B''*/ \
michael@0 439 __asm paddw mm5,mm6 \
michael@0 440 /*r7=R7=G'-C'*/ \
michael@0 441 __asm psubw mm7,mm0 \
michael@0 442 __asm paddw mm0,mm0 \
michael@0 443 /*Save R1.*/ \
michael@0 444 __asm movq OC_I(1,_y),mm1 \
michael@0 445 /*r0=R0=G'+C'*/ \
michael@0 446 __asm paddw mm0,mm7 \
michael@0 447 }
michael@0 448
michael@0 449 /*25+19=44 cycles.*/
/*Expands OC_IDCT_BEGIN_10 with _y as both source and destination (the pass
  is done in place), then applies the same finishing sequence as
  OC_COLUMN_IDCT.
 The row of 8's (OC_8) is added to one term of each output pair, and carried
  into the other term by the paddw that follows, before the arithmetic right
  shifts by 4 (psraw).
 NR0...NR3 are stored to OC_I(0...3,_y) and NR4...NR7 to OC_J(4...7,_y).*/
michael@0 450 #define OC_COLUMN_IDCT_10(_y) __asm{ \
michael@0 451 OC_IDCT_BEGIN_10(_y,_y) \
michael@0 452 __asm paddw mm2,OC_8 \
michael@0 453 /*r1=H'+H'*/ \
michael@0 454 __asm paddw mm1,mm1 \
michael@0 455 /*r1=R1=A''+H'*/ \
michael@0 456 __asm paddw mm1,mm2 \
michael@0 457 /*r2=NR2*/ \
michael@0 458 __asm psraw mm2,4 \
michael@0 459 /*r4=E'=E-G*/ \
michael@0 460 __asm psubw mm4,mm7 \
michael@0 461 /*r1=NR1*/ \
michael@0 462 __asm psraw mm1,4 \
michael@0 463 /*r3=D'*/ \
michael@0 464 __asm movq mm3,OC_I(2,_y) \
michael@0 465 /*r7=G+G*/ \
michael@0 466 __asm paddw mm7,mm7 \
michael@0 467 /*Store NR2 at I(2).*/ \
michael@0 468 __asm movq OC_I(2,_y),mm2 \
michael@0 469 /*r7=G'=E+G*/ \
michael@0 470 __asm paddw mm7,mm4 \
michael@0 471 /*Store NR1 at I(1).*/ \
michael@0 472 __asm movq OC_I(1,_y),mm1 \
michael@0 473 /*r4=R4=E'-D'*/ \
michael@0 474 __asm psubw mm4,mm3 \
michael@0 475 __asm paddw mm4,OC_8 \
michael@0 476 /*r3=D'+D'*/ \
michael@0 477 __asm paddw mm3,mm3 \
michael@0 478 /*r3=R3=E'+D'*/ \
michael@0 479 __asm paddw mm3,mm4 \
michael@0 480 /*r4=NR4*/ \
michael@0 481 __asm psraw mm4,4 \
michael@0 482 /*r6=R6=F'-B''*/ \
michael@0 483 __asm psubw mm6,mm5 \
michael@0 484 /*r3=NR3*/ \
michael@0 485 __asm psraw mm3,4 \
michael@0 486 __asm paddw mm6,OC_8 \
michael@0 487 /*r5=B''+B''*/ \
michael@0 488 __asm paddw mm5,mm5 \
michael@0 489 /*r5=R5=F'+B''*/ \
michael@0 490 __asm paddw mm5,mm6 \
michael@0 491 /*r6=NR6*/ \
michael@0 492 __asm psraw mm6,4 \
michael@0 493 /*Store NR4 at J(4).*/ \
michael@0 494 __asm movq OC_J(4,_y),mm4 \
michael@0 495 /*r5=NR5*/ \
michael@0 496 __asm psraw mm5,4 \
michael@0 497 /*Store NR3 at I(3).*/ \
michael@0 498 __asm movq OC_I(3,_y),mm3 \
michael@0 499 /*r7=R7=G'-C'*/ \
michael@0 500 __asm psubw mm7,mm0 \
michael@0 501 __asm paddw mm7,OC_8 \
michael@0 502 /*r0=C'+C'*/ \
michael@0 503 __asm paddw mm0,mm0 \
michael@0 504 /*r0=R0=G'+C'*/ \
michael@0 505 __asm paddw mm0,mm7 \
michael@0 506 /*r7=NR7*/ \
michael@0 507 __asm psraw mm7,4 \
michael@0 508 /*Store NR6 at J(6).*/ \
michael@0 509 __asm movq OC_J(6,_y),mm6 \
michael@0 510 /*r0=NR0*/ \
michael@0 511 __asm psraw mm0,4 \
michael@0 512 /*Store NR5 at J(5).*/ \
michael@0 513 __asm movq OC_J(5,_y),mm5 \
michael@0 514 /*Store NR7 at J(7).*/ \
michael@0 515 __asm movq OC_J(7,_y),mm7 \
michael@0 516 /*Store NR0 at I(0).*/ \
michael@0 517 __asm movq OC_I(0,_y),mm0 \
michael@0 518 }
michael@0 519
/*Reduced 8x8 iDCT; oc_idct8x8_mmx selects it when _last_zzi<=10.
 _y: The 64 output values.
 _x: The input coefficients.
     Only the first four 16-bit values at byte offsets 0, 16, 32 and 48 are
      read, and only those same positions are cleared to zero before returning
      (and only when _x and _y are different buffers).
 A single row pass plus transpose is followed by two in-place column passes
  over _y (at byte offsets 0 and 8).
 NOTE(review): this presumes every coefficient outside those 16 positions is
  already zero - confirm against the callers.
 NOTE(review): no emms is issued here, so the caller is presumably
  responsible for leaving MMX state - confirm.*/
michael@0 520 static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){
michael@0 521 __asm{
michael@0 522 #define CONSTS eax
michael@0 523 #define Y edx
michael@0 524 #define X ecx
michael@0 525 mov CONSTS,offset OC_IDCT_CONSTS
michael@0 526 mov Y,_y
michael@0 527 mov X,_x
michael@0 528 #define OC_I(_k,_y) [(_y)+(_k)*16]
michael@0 529 #define OC_J(_k,_y) [(_y)+((_k)-4)*16+8]
michael@0 530 /*Done with dequant, descramble, and partial transpose.
michael@0 531 Now do the iDCT itself.*/
michael@0 532 OC_ROW_IDCT_10(Y,X)
michael@0 533 OC_TRANSPOSE(Y)
michael@0 534 #undef OC_I
michael@0 535 #undef OC_J
/*First column pass: OC_J is the same as OC_I, so the eight outputs are 16
   bytes apart, starting at byte offset 0.*/
michael@0 536 #define OC_I(_k,_y) [(_y)+(_k)*16]
michael@0 537 #define OC_J(_k,_y) OC_I(_k,_y)
michael@0 538 OC_COLUMN_IDCT_10(Y)
michael@0 539 #undef OC_I
michael@0 540 #undef OC_J
/*Second column pass: as above, starting at byte offset 8.*/
michael@0 541 #define OC_I(_k,_y) [(_y)+(_k)*16+8]
michael@0 542 #define OC_J(_k,_y) OC_I(_k,_y)
michael@0 543 OC_COLUMN_IDCT_10(Y)
michael@0 544 #undef OC_I
michael@0 545 #undef OC_J
michael@0 546 #undef CONSTS
michael@0 547 #undef Y
michael@0 548 #undef X
michael@0 549 }
michael@0 550 if(_x!=_y){
michael@0 551 #define X ecx
/*Clear 8 bytes (four coefficients) at the start of each of the first four
   16-byte rows.*/
michael@0 552 __asm{
michael@0 553 pxor mm0,mm0;
michael@0 554 mov X,_x
michael@0 555 movq [X+0x00],mm0
michael@0 556 movq [X+0x10],mm0
michael@0 557 movq [X+0x20],mm0
michael@0 558 movq [X+0x30],mm0
michael@0 559 }
michael@0 560 #undef X
michael@0 561 }
michael@0 562 }
michael@0 563
michael@0 564 /*Performs an inverse 8x8 Type-II DCT transform.
michael@0 565 The input is assumed to be scaled by a factor of 4 relative to the
michael@0 566 orthonormal version of the transform.
 _y: The buffer to store the result in.
 _x: The input coefficients; the selected routine clears the coefficients it
      consumed when _x and _y are different buffers.
 _last_zzi: The value of zzi before the final token of the block was decoded
             (see below); values of 10 or less select the reduced transform.*/
michael@0 567 void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){
michael@0 568 /*_last_zzi is subtly different from an actual count of the number of
michael@0 569 coefficients we decoded for this block.
michael@0 570 It contains the value of zzi BEFORE the final token in the block was
michael@0 571 decoded.
michael@0 572 In most cases this is an EOB token (the continuation of an EOB run from a
michael@0 573 previous block counts), and so this is the same as the coefficient count.
michael@0 574 However, in the case that the last token was NOT an EOB token, but filled
michael@0 575 the block up with exactly 64 coefficients, _last_zzi will be less than 64.
michael@0 576 Provided the last token was not a pure zero run, the minimum value it can
michael@0 577 be is 46, and so that doesn't affect any of the cases in this routine.
michael@0 578 However, if the last token WAS a pure zero run of length 63, then _last_zzi
michael@0 579 will be 1 while the number of coefficients decoded is 64.
michael@0 580 Thus, we will trigger the following special case, where the real
michael@0 581 coefficient count would not.
michael@0 582 Note also that a zero run of length 64 will give _last_zzi a value of 0,
michael@0 583 but we still process the DC coefficient, which might have a non-zero value
michael@0 584 due to DC prediction.
michael@0 585 Although convoluted, this is arguably the correct behavior: it allows us to
michael@0 586 use a smaller transform when the block ends with a long zero run instead
michael@0 587 of a normal EOB token.
michael@0 588 It could be smarter... multiple separate zero runs at the end of a block
michael@0 589 will fool it, but an encoder that generates these really deserves what it
michael@0 590 gets.
michael@0 591 Needless to say we inherited this approach from VP3.*/
michael@0 592 /*Perform the iDCT.*/
michael@0 593 if(_last_zzi<=10)oc_idct8x8_10(_y,_x);
michael@0 594 else oc_idct8x8_slow(_y,_x);
michael@0 595 }
michael@0 596
michael@0 597 #endif

mercurial