|
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
    last mod: $Id: mmxidct.c 17446 2010-09-23 20:06:20Z tterribe $

 ********************************************************************/
|
17 |
|
/*MMX acceleration of Theora's iDCT.
  Originally written by Rudolf Marek, based on code from On2's VP3.*/
|
20 #include "x86int.h" |
|
21 #include "../dct.h" |
|
22 |
|
23 #if defined(OC_X86_ASM) |
|
24 |
|
/*These are byte offsets into the table of constants below (OC_IDCT_CONSTS).
  Each row of the table holds four 16-bit words, i.e. 8 bytes, so row N starts
   at byte offset 8*N.*/
/*7 rows of cosines, in order: pi/16 * (1 ... 7).
  They follow the single row of 8's, hence the offset of 8 bytes.*/
#define OC_COSINE_OFFSET (8)
/*A row of 8's.
  This is the first row of the table.*/
#define OC_EIGHT_OFFSET  (0)
|
30 |
|
31 |
|
32 |
|
33 /*A table of constants used by the MMX routines.*/ |
|
34 static const OC_ALIGN16(ogg_uint16_t) OC_IDCT_CONSTS[(1+7)*4]={ |
|
35 8, 8, 8, 8, |
|
36 (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7, |
|
37 (ogg_uint16_t)OC_C1S7,(ogg_uint16_t)OC_C1S7, |
|
38 (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6, |
|
39 (ogg_uint16_t)OC_C2S6,(ogg_uint16_t)OC_C2S6, |
|
40 (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5, |
|
41 (ogg_uint16_t)OC_C3S5,(ogg_uint16_t)OC_C3S5, |
|
42 (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4, |
|
43 (ogg_uint16_t)OC_C4S4,(ogg_uint16_t)OC_C4S4, |
|
44 (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3, |
|
45 (ogg_uint16_t)OC_C5S3,(ogg_uint16_t)OC_C5S3, |
|
46 (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2, |
|
47 (ogg_uint16_t)OC_C6S2,(ogg_uint16_t)OC_C6S2, |
|
48 (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1, |
|
49 (ogg_uint16_t)OC_C7S1,(ogg_uint16_t)OC_C7S1 |
|
50 }; |
|
51 |
|
/*Shared first stage of the 8-point iDCT, computed on four 16-bit lanes at
   once.
  _x: The register holding the base address of the input.
  _y: The register holding the base address of the output/scratch area.
  OC_I(), OC_J() and OC_C() must be defined at the point of expansion; they
   supply the memory operands for the input rows and the cosine constants.
  Reads OC_I(0...3,_x), OC_J(4...7,_x) and OC_C(1...7).
  Writes two intermediates to OC_I(1,_y) and OC_I(2,_y).
  Both stores are issued only after rows 1 and 2 of the input have been read,
   so the macro can be expanded with _y equal to _x.
  The value stored at OC_I(1,_y) is reloaded into mm0 before exit; the one at
   OC_I(2,_y) is left in memory for the code that follows.
  All of mm0...mm7 are overwritten and carry live intermediates on exit; the
   register comments in OC_ROW_IDCT and OC_COLUMN_IDCT name them.
  NOTE(review): the paddw that follows most of the pmulhw's presumably
   compensates for cosine constants that do not fit in a signed 16-bit word
   (the table stores them cast to ogg_uint16_t) -- confirm against ../dct.h.
  The instruction interleaving looks like a hand schedule; keep the order.
  38 cycles*/
#define OC_IDCT_BEGIN(_y,_x) __asm{ \
  __asm movq mm2,OC_I(3,_x) \
  __asm movq mm6,OC_C(3) \
  __asm movq mm4,mm2 \
  __asm movq mm7,OC_J(5,_x) \
  __asm pmulhw mm4,mm6 \
  __asm movq mm1,OC_C(5) \
  __asm pmulhw mm6,mm7 \
  __asm movq mm5,mm1 \
  __asm pmulhw mm1,mm2 \
  __asm movq mm3,OC_I(1,_x) \
  __asm pmulhw mm5,mm7 \
  __asm movq mm0,OC_C(1) \
  __asm paddw mm4,mm2 \
  __asm paddw mm6,mm7 \
  __asm paddw mm2,mm1 \
  __asm movq mm1,OC_J(7,_x) \
  __asm paddw mm7,mm5 \
  __asm movq mm5,mm0 \
  __asm pmulhw mm0,mm3 \
  __asm paddw mm4,mm7 \
  __asm pmulhw mm5,mm1 \
  __asm movq mm7,OC_C(7) \
  __asm psubw mm6,mm2 \
  __asm paddw mm0,mm3 \
  __asm pmulhw mm3,mm7 \
  __asm movq mm2,OC_I(2,_x) \
  __asm pmulhw mm7,mm1 \
  __asm paddw mm5,mm1 \
  __asm movq mm1,mm2 \
  __asm pmulhw mm2,OC_C(2) \
  __asm psubw mm3,mm5 \
  __asm movq mm5,OC_J(6,_x) \
  __asm paddw mm0,mm7 \
  __asm movq mm7,mm5 \
  __asm psubw mm0,mm4 \
  __asm pmulhw mm5,OC_C(2) \
  __asm paddw mm2,mm1 \
  __asm pmulhw mm1,OC_C(6) \
  __asm paddw mm4,mm4 \
  __asm paddw mm4,mm0 \
  __asm psubw mm3,mm6 \
  __asm paddw mm5,mm7 \
  __asm paddw mm6,mm6 \
  __asm pmulhw mm7,OC_C(6) \
  __asm paddw mm6,mm3 \
  /*Spill the first intermediate; it is reloaded into mm0 below.*/ \
  __asm movq OC_I(1,_y),mm4 \
  __asm psubw mm1,mm5 \
  __asm movq mm4,OC_C(4) \
  __asm movq mm5,mm3 \
  __asm pmulhw mm3,mm4 \
  __asm paddw mm7,mm2 \
  /*Spill the second intermediate; the caller's epilogue reloads it.*/ \
  __asm movq OC_I(2,_y),mm6 \
  __asm movq mm2,mm0 \
  __asm movq mm6,OC_I(0,_x) \
  __asm pmulhw mm0,mm4 \
  __asm paddw mm5,mm3 \
  __asm movq mm3,OC_J(4,_x) \
  __asm psubw mm5,mm1 \
  __asm paddw mm2,mm0 \
  __asm psubw mm6,mm3 \
  __asm movq mm0,mm6 \
  __asm pmulhw mm6,mm4 \
  __asm paddw mm3,mm3 \
  __asm paddw mm1,mm1 \
  __asm paddw mm3,mm0 \
  __asm paddw mm1,mm5 \
  __asm pmulhw mm4,mm3 \
  __asm paddw mm6,mm0 \
  __asm psubw mm6,mm2 \
  __asm paddw mm2,mm2 \
  __asm movq mm0,OC_I(1,_y) \
  __asm paddw mm2,mm6 \
  __asm paddw mm4,mm3 \
  __asm psubw mm2,mm1 \
}
|
129 |
|
/*The first (row) pass of the iDCT: OC_IDCT_BEGIN followed by the final
   butterflies.
  _x: The register holding the base address of the input.
  _y: The register holding the base address of the output/scratch area.
  No rounding or scaling is applied in this pass.
  On exit the eight results are where OC_TRANSPOSE expects them: R0 in mm0, R1
   stored at OC_I(1,_y), and R2...R7 in mm2...mm7.
  mm1 is free once R1 has been stored.
  38+8=46 cycles.*/
#define OC_ROW_IDCT(_y,_x) __asm{ \
  OC_IDCT_BEGIN(_y,_x) \
  /*r3=D'*/ \
  __asm movq mm3,OC_I(2,_y) \
  /*r4=E'=E-G*/ \
  __asm psubw mm4,mm7 \
  /*r1=H'+H'*/ \
  __asm paddw mm1,mm1 \
  /*r7=G+G*/ \
  __asm paddw mm7,mm7 \
  /*r1=R1=A''+H'*/ \
  __asm paddw mm1,mm2 \
  /*r7=G'=E+G*/ \
  __asm paddw mm7,mm4 \
  /*r4=R4=E'-D'*/ \
  __asm psubw mm4,mm3 \
  __asm paddw mm3,mm3 \
  /*r6=R6=F'-B''*/ \
  __asm psubw mm6,mm5 \
  __asm paddw mm5,mm5 \
  /*r3=R3=E'+D'*/ \
  __asm paddw mm3,mm4 \
  /*r5=R5=F'+B''*/ \
  __asm paddw mm5,mm6 \
  /*r7=R7=G'-C'*/ \
  __asm psubw mm7,mm0 \
  __asm paddw mm0,mm0 \
  /*Save R1.*/ \
  __asm movq OC_I(1,_y),mm1 \
  /*r0=R0=G'+C'*/ \
  __asm paddw mm0,mm7 \
}
|
163 |
|
/*The following macro does two 4x4 transposes in place.
  _y: The register holding the base address of the area to write to (and from
       which I(1) is read).

  At entry, we assume:
    r0 = a3 a2 a1 a0
  I(1) = b3 b2 b1 b0
    r2 = c3 c2 c1 c0
    r3 = d3 d2 d1 d0

    r4 = e3 e2 e1 e0
    r5 = f3 f2 f1 f0
    r6 = g3 g2 g1 g0
    r7 = h3 h2 h1 h0

  At exit, we have:
  I(0) = d0 c0 b0 a0
  I(1) = d1 c1 b1 a1
  I(2) = d2 c2 b2 a2
  I(3) = d3 c3 b3 a3

  J(4) = h0 g0 f0 e0
  J(5) = h1 g1 f1 e1
  J(6) = h2 g2 f2 e2
  J(7) = h3 g3 f3 e3

  I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
  J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.

  Since r1 is free at entry, we calculate the Js first.
  r0 is parked in I(0) while that happens, to free a second scratch register,
   and is read back for the second transpose.
  All of mm0...mm7 are clobbered.*/
/*19 cycles.*/
#define OC_TRANSPOSE(_y) __asm{ \
  __asm movq mm1,mm4 \
  __asm punpcklwd mm4,mm5 \
  /*Park r0 so that mm0 can be used as scratch for the Js.*/ \
  __asm movq OC_I(0,_y),mm0 \
  __asm punpckhwd mm1,mm5 \
  __asm movq mm0,mm6 \
  __asm punpcklwd mm6,mm7 \
  __asm movq mm5,mm4 \
  __asm punpckldq mm4,mm6 \
  __asm punpckhdq mm5,mm6 \
  __asm movq mm6,mm1 \
  __asm movq OC_J(4,_y),mm4 \
  __asm punpckhwd mm0,mm7 \
  __asm movq OC_J(5,_y),mm5 \
  __asm punpckhdq mm6,mm0 \
  /*Fetch the parked r0 and the saved I(1) for the second transpose.*/ \
  __asm movq mm4,OC_I(0,_y) \
  __asm punpckldq mm1,mm0 \
  __asm movq mm5,OC_I(1,_y) \
  __asm movq mm0,mm4 \
  __asm movq OC_J(7,_y),mm6 \
  __asm punpcklwd mm0,mm5 \
  __asm movq OC_J(6,_y),mm1 \
  __asm punpckhwd mm4,mm5 \
  __asm movq mm5,mm2 \
  __asm punpcklwd mm2,mm3 \
  __asm movq mm1,mm0 \
  __asm punpckldq mm0,mm2 \
  __asm punpckhdq mm1,mm2 \
  __asm movq mm2,mm4 \
  __asm movq OC_I(0,_y),mm0 \
  __asm punpckhwd mm5,mm3 \
  __asm movq OC_I(1,_y),mm1 \
  __asm punpckhdq mm4,mm5 \
  __asm punpckldq mm2,mm5 \
  __asm movq OC_I(3,_y),mm4 \
  __asm movq OC_I(2,_y),mm2 \
}
|
229 |
|
/*The second (column) pass of the iDCT, performed in place on _y.
  _y: The register holding the base address of the data; it is both the input
       and the output of OC_IDCT_BEGIN here.
  Unlike the row pass, every result is rounded and scaled: 8 (OC_8) is added
   and the sum is shifted right arithmetically by 4 bits.
  The bias is added once to the difference term of each butterfly pair
   (mm2, mm4, mm6, mm7); the matching sum term is formed from the biased
   difference and so inherits it.
  The final values NR0...NR3 are stored to OC_I(0...3,_y) and NR4...NR7 to
   OC_J(4...7,_y).
  All of mm0...mm7 are clobbered.
  38+19=57 cycles.*/
#define OC_COLUMN_IDCT(_y) __asm{ \
  OC_IDCT_BEGIN(_y,_y) \
  __asm paddw mm2,OC_8 \
  /*r1=H'+H'*/ \
  __asm paddw mm1,mm1 \
  /*r1=R1=A''+H'*/ \
  __asm paddw mm1,mm2 \
  /*r2=NR2*/ \
  __asm psraw mm2,4 \
  /*r4=E'=E-G*/ \
  __asm psubw mm4,mm7 \
  /*r1=NR1*/ \
  __asm psraw mm1,4 \
  /*r3=D'*/ \
  __asm movq mm3,OC_I(2,_y) \
  /*r7=G+G*/ \
  __asm paddw mm7,mm7 \
  /*Store NR2 at I(2).*/ \
  __asm movq OC_I(2,_y),mm2 \
  /*r7=G'=E+G*/ \
  __asm paddw mm7,mm4 \
  /*Store NR1 at I(1).*/ \
  __asm movq OC_I(1,_y),mm1 \
  /*r4=R4=E'-D'*/ \
  __asm psubw mm4,mm3 \
  __asm paddw mm4,OC_8 \
  /*r3=D'+D'*/ \
  __asm paddw mm3,mm3 \
  /*r3=R3=E'+D'*/ \
  __asm paddw mm3,mm4 \
  /*r4=NR4*/ \
  __asm psraw mm4,4 \
  /*r6=R6=F'-B''*/ \
  __asm psubw mm6,mm5 \
  /*r3=NR3*/ \
  __asm psraw mm3,4 \
  __asm paddw mm6,OC_8 \
  /*r5=B''+B''*/ \
  __asm paddw mm5,mm5 \
  /*r5=R5=F'+B''*/ \
  __asm paddw mm5,mm6 \
  /*r6=NR6*/ \
  __asm psraw mm6,4 \
  /*Store NR4 at J(4).*/ \
  __asm movq OC_J(4,_y),mm4 \
  /*r5=NR5*/ \
  __asm psraw mm5,4 \
  /*Store NR3 at I(3).*/ \
  __asm movq OC_I(3,_y),mm3 \
  /*r7=R7=G'-C'*/ \
  __asm psubw mm7,mm0 \
  __asm paddw mm7,OC_8 \
  /*r0=C'+C'*/ \
  __asm paddw mm0,mm0 \
  /*r0=R0=G'+C'*/ \
  __asm paddw mm0,mm7 \
  /*r7=NR7*/ \
  __asm psraw mm7,4 \
  /*Store NR6 at J(6).*/ \
  __asm movq OC_J(6,_y),mm6 \
  /*r0=NR0*/ \
  __asm psraw mm0,4 \
  /*Store NR5 at J(5).*/ \
  __asm movq OC_J(5,_y),mm5 \
  /*Store NR7 at J(7).*/ \
  __asm movq OC_J(7,_y),mm7 \
  /*Store NR0 at I(0).*/ \
  __asm movq OC_I(0,_y),mm0 \
}
|
300 |
|
/*Memory operand for row _i of the group of constants that starts _m bytes
   into the table whose address is held in the register CONSTS.
  Rows are 8 bytes apart.*/
#define OC_MID(_m,_i) [CONSTS+_m+(_i)*8]
/*Memory operand for the _i'th cosine constant, with _i counted from 1.
  The argument is parenthesized so that an expression may be passed safely.*/
#define OC_C(_i)      OC_MID(OC_COSINE_OFFSET,(_i)-1)
/*Memory operand for the row of 8's used as the rounding bias.*/
#define OC_8          OC_MID(OC_EIGHT_OFFSET,0)
|
304 |
|
305 static void oc_idct8x8_slow(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
|
306 int i; |
|
307 /*This routine accepts an 8x8 matrix, but in partially transposed form. |
|
308 Every 4x4 block is transposed.*/ |
|
309 __asm{ |
|
310 #define CONSTS eax |
|
311 #define Y edx |
|
312 #define X ecx |
|
313 mov CONSTS,offset OC_IDCT_CONSTS |
|
314 mov Y,_y |
|
315 mov X,_x |
|
316 #define OC_I(_k,_y) [(_y)+(_k)*16] |
|
317 #define OC_J(_k,_y) [(_y)+((_k)-4)*16+8] |
|
318 OC_ROW_IDCT(Y,X) |
|
319 OC_TRANSPOSE(Y) |
|
320 #undef OC_I |
|
321 #undef OC_J |
|
322 #define OC_I(_k,_y) [(_y)+(_k)*16+64] |
|
323 #define OC_J(_k,_y) [(_y)+((_k)-4)*16+72] |
|
324 OC_ROW_IDCT(Y,X) |
|
325 OC_TRANSPOSE(Y) |
|
326 #undef OC_I |
|
327 #undef OC_J |
|
328 #define OC_I(_k,_y) [(_y)+(_k)*16] |
|
329 #define OC_J(_k,_y) OC_I(_k,_y) |
|
330 OC_COLUMN_IDCT(Y) |
|
331 #undef OC_I |
|
332 #undef OC_J |
|
333 #define OC_I(_k,_y) [(_y)+(_k)*16+8] |
|
334 #define OC_J(_k,_y) OC_I(_k,_y) |
|
335 OC_COLUMN_IDCT(Y) |
|
336 #undef OC_I |
|
337 #undef OC_J |
|
338 #undef CONSTS |
|
339 #undef Y |
|
340 #undef X |
|
341 } |
|
342 if(_x!=_y){ |
|
343 int i; |
|
344 __asm pxor mm0,mm0; |
|
345 for(i=0;i<4;i++){ |
|
346 ogg_int16_t *x; |
|
347 x=_x+16*i; |
|
348 #define X ecx |
|
349 __asm{ |
|
350 mov X,x |
|
351 movq [X+0x00],mm0 |
|
352 movq [X+0x08],mm0 |
|
353 movq [X+0x10],mm0 |
|
354 movq [X+0x18],mm0 |
|
355 } |
|
356 #undef X |
|
357 } |
|
358 } |
|
359 } |
|
360 |
|
/*Reduced version of OC_IDCT_BEGIN that reads only OC_I(0...3,_x); no OC_J()
   input is read, i.e. inputs 4...7 are taken to be zero.
  _x: The register holding the base address of the input.
  _y: The register holding the base address of the output/scratch area.
  OC_I() and OC_C() must be defined at the point of expansion.
  As in OC_IDCT_BEGIN, two intermediates are written to OC_I(1,_y) and
   OC_I(2,_y) (after rows 1 and 2 of the input have been read, so _y may equal
   _x); the first is reloaded into mm0 before exit.
  All of mm0...mm7 are overwritten and carry live intermediates on exit, for
   use by OC_ROW_IDCT_10 or OC_COLUMN_IDCT_10.
  The two nop's are part of the original instruction stream; NOTE(review):
   presumably scheduling/alignment padding -- confirm before removing.
  25 cycles.*/
#define OC_IDCT_BEGIN_10(_y,_x) __asm{ \
  __asm movq mm2,OC_I(3,_x) \
  __asm nop \
  __asm movq mm6,OC_C(3) \
  __asm movq mm4,mm2 \
  __asm movq mm1,OC_C(5) \
  __asm pmulhw mm4,mm6 \
  __asm movq mm3,OC_I(1,_x) \
  __asm pmulhw mm1,mm2 \
  __asm movq mm0,OC_C(1) \
  __asm paddw mm4,mm2 \
  __asm pxor mm6,mm6 \
  __asm paddw mm2,mm1 \
  __asm movq mm5,OC_I(2,_x) \
  __asm pmulhw mm0,mm3 \
  __asm movq mm1,mm5 \
  __asm paddw mm0,mm3 \
  __asm pmulhw mm3,OC_C(7) \
  __asm psubw mm6,mm2 \
  __asm pmulhw mm5,OC_C(2) \
  __asm psubw mm0,mm4 \
  __asm movq mm7,OC_I(2,_x) \
  __asm paddw mm4,mm4 \
  __asm paddw mm7,mm5 \
  __asm paddw mm4,mm0 \
  __asm pmulhw mm1,OC_C(6) \
  __asm psubw mm3,mm6 \
  /*Spill the first intermediate; it is reloaded into mm0 below.*/ \
  __asm movq OC_I(1,_y),mm4 \
  __asm paddw mm6,mm6 \
  __asm movq mm4,OC_C(4) \
  __asm paddw mm6,mm3 \
  __asm movq mm5,mm3 \
  __asm pmulhw mm3,mm4 \
  /*Spill the second intermediate; the caller's epilogue reloads it.*/ \
  __asm movq OC_I(2,_y),mm6 \
  __asm movq mm2,mm0 \
  __asm movq mm6,OC_I(0,_x) \
  __asm pmulhw mm0,mm4 \
  __asm paddw mm5,mm3 \
  __asm paddw mm2,mm0 \
  __asm psubw mm5,mm1 \
  __asm pmulhw mm6,mm4 \
  __asm paddw mm6,OC_I(0,_x) \
  __asm paddw mm1,mm1 \
  __asm movq mm4,mm6 \
  __asm paddw mm1,mm5 \
  __asm psubw mm6,mm2 \
  __asm paddw mm2,mm2 \
  __asm movq mm0,OC_I(1,_y) \
  __asm paddw mm2,mm6 \
  __asm psubw mm2,mm1 \
  __asm nop \
}
|
414 |
|
/*The row pass of the reduced iDCT: OC_IDCT_BEGIN_10 followed by the same
   final butterflies as OC_ROW_IDCT.
  _x: The register holding the base address of the input.
  _y: The register holding the base address of the output/scratch area.
  No rounding or scaling is applied in this pass.
  On exit the eight results are where OC_TRANSPOSE expects them: R0 in mm0, R1
   stored at OC_I(1,_y), and R2...R7 in mm2...mm7.
  25+8=33 cycles.*/
#define OC_ROW_IDCT_10(_y,_x) __asm{ \
  OC_IDCT_BEGIN_10(_y,_x) \
  /*r3=D'*/ \
  __asm movq mm3,OC_I(2,_y) \
  /*r4=E'=E-G*/ \
  __asm psubw mm4,mm7 \
  /*r1=H'+H'*/ \
  __asm paddw mm1,mm1 \
  /*r7=G+G*/ \
  __asm paddw mm7,mm7 \
  /*r1=R1=A''+H'*/ \
  __asm paddw mm1,mm2 \
  /*r7=G'=E+G*/ \
  __asm paddw mm7,mm4 \
  /*r4=R4=E'-D'*/ \
  __asm psubw mm4,mm3 \
  __asm paddw mm3,mm3 \
  /*r6=R6=F'-B''*/ \
  __asm psubw mm6,mm5 \
  __asm paddw mm5,mm5 \
  /*r3=R3=E'+D'*/ \
  __asm paddw mm3,mm4 \
  /*r5=R5=F'+B''*/ \
  __asm paddw mm5,mm6 \
  /*r7=R7=G'-C'*/ \
  __asm psubw mm7,mm0 \
  __asm paddw mm0,mm0 \
  /*Save R1.*/ \
  __asm movq OC_I(1,_y),mm1 \
  /*r0=R0=G'+C'*/ \
  __asm paddw mm0,mm7 \
}
|
448 |
|
/*The column pass of the reduced iDCT, performed in place on _y:
   OC_IDCT_BEGIN_10 followed by the same rounding epilogue as OC_COLUMN_IDCT.
  _y: The register holding the base address of the data; it is both the input
       and the output of OC_IDCT_BEGIN_10 here.
  Only OC_I(0...3,_y) are read, but all eight outputs are produced.
  Every result has 8 (OC_8) added and is shifted right arithmetically by 4
   bits; the bias is added once to the difference term of each butterfly pair
   and inherited by the matching sum term.
  The final values NR0...NR3 are stored to OC_I(0...3,_y) and NR4...NR7 to
   OC_J(4...7,_y).
  All of mm0...mm7 are clobbered.
  25+19=44 cycles.*/
#define OC_COLUMN_IDCT_10(_y) __asm{ \
  OC_IDCT_BEGIN_10(_y,_y) \
  __asm paddw mm2,OC_8 \
  /*r1=H'+H'*/ \
  __asm paddw mm1,mm1 \
  /*r1=R1=A''+H'*/ \
  __asm paddw mm1,mm2 \
  /*r2=NR2*/ \
  __asm psraw mm2,4 \
  /*r4=E'=E-G*/ \
  __asm psubw mm4,mm7 \
  /*r1=NR1*/ \
  __asm psraw mm1,4 \
  /*r3=D'*/ \
  __asm movq mm3,OC_I(2,_y) \
  /*r7=G+G*/ \
  __asm paddw mm7,mm7 \
  /*Store NR2 at I(2).*/ \
  __asm movq OC_I(2,_y),mm2 \
  /*r7=G'=E+G*/ \
  __asm paddw mm7,mm4 \
  /*Store NR1 at I(1).*/ \
  __asm movq OC_I(1,_y),mm1 \
  /*r4=R4=E'-D'*/ \
  __asm psubw mm4,mm3 \
  __asm paddw mm4,OC_8 \
  /*r3=D'+D'*/ \
  __asm paddw mm3,mm3 \
  /*r3=R3=E'+D'*/ \
  __asm paddw mm3,mm4 \
  /*r4=NR4*/ \
  __asm psraw mm4,4 \
  /*r6=R6=F'-B''*/ \
  __asm psubw mm6,mm5 \
  /*r3=NR3*/ \
  __asm psraw mm3,4 \
  __asm paddw mm6,OC_8 \
  /*r5=B''+B''*/ \
  __asm paddw mm5,mm5 \
  /*r5=R5=F'+B''*/ \
  __asm paddw mm5,mm6 \
  /*r6=NR6*/ \
  __asm psraw mm6,4 \
  /*Store NR4 at J(4).*/ \
  __asm movq OC_J(4,_y),mm4 \
  /*r5=NR5*/ \
  __asm psraw mm5,4 \
  /*Store NR3 at I(3).*/ \
  __asm movq OC_I(3,_y),mm3 \
  /*r7=R7=G'-C'*/ \
  __asm psubw mm7,mm0 \
  __asm paddw mm7,OC_8 \
  /*r0=C'+C'*/ \
  __asm paddw mm0,mm0 \
  /*r0=R0=G'+C'*/ \
  __asm paddw mm0,mm7 \
  /*r7=NR7*/ \
  __asm psraw mm7,4 \
  /*Store NR6 at J(6).*/ \
  __asm movq OC_J(6,_y),mm6 \
  /*r0=NR0*/ \
  __asm psraw mm0,4 \
  /*Store NR5 at J(5).*/ \
  __asm movq OC_J(5,_y),mm5 \
  /*Store NR7 at J(7).*/ \
  __asm movq OC_J(7,_y),mm7 \
  /*Store NR0 at I(0).*/ \
  __asm movq OC_I(0,_y),mm0 \
}
|
519 |
|
520 static void oc_idct8x8_10(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
|
521 __asm{ |
|
522 #define CONSTS eax |
|
523 #define Y edx |
|
524 #define X ecx |
|
525 mov CONSTS,offset OC_IDCT_CONSTS |
|
526 mov Y,_y |
|
527 mov X,_x |
|
528 #define OC_I(_k,_y) [(_y)+(_k)*16] |
|
529 #define OC_J(_k,_y) [(_y)+((_k)-4)*16+8] |
|
530 /*Done with dequant, descramble, and partial transpose. |
|
531 Now do the iDCT itself.*/ |
|
532 OC_ROW_IDCT_10(Y,X) |
|
533 OC_TRANSPOSE(Y) |
|
534 #undef OC_I |
|
535 #undef OC_J |
|
536 #define OC_I(_k,_y) [(_y)+(_k)*16] |
|
537 #define OC_J(_k,_y) OC_I(_k,_y) |
|
538 OC_COLUMN_IDCT_10(Y) |
|
539 #undef OC_I |
|
540 #undef OC_J |
|
541 #define OC_I(_k,_y) [(_y)+(_k)*16+8] |
|
542 #define OC_J(_k,_y) OC_I(_k,_y) |
|
543 OC_COLUMN_IDCT_10(Y) |
|
544 #undef OC_I |
|
545 #undef OC_J |
|
546 #undef CONSTS |
|
547 #undef Y |
|
548 #undef X |
|
549 } |
|
550 if(_x!=_y){ |
|
551 #define X ecx |
|
552 __asm{ |
|
553 pxor mm0,mm0; |
|
554 mov X,_x |
|
555 movq [X+0x00],mm0 |
|
556 movq [X+0x10],mm0 |
|
557 movq [X+0x20],mm0 |
|
558 movq [X+0x30],mm0 |
|
559 } |
|
560 #undef X |
|
561 } |
|
562 } |
|
563 |
|
564 /*Performs an inverse 8x8 Type-II DCT transform. |
|
565 The input is assumed to be scaled by a factor of 4 relative to orthonormal |
|
566 version of the transform.*/ |
|
567 void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ |
|
568 /*_last_zzi is subtly different from an actual count of the number of |
|
569 coefficients we decoded for this block. |
|
570 It contains the value of zzi BEFORE the final token in the block was |
|
571 decoded. |
|
572 In most cases this is an EOB token (the continuation of an EOB run from a |
|
573 previous block counts), and so this is the same as the coefficient count. |
|
574 However, in the case that the last token was NOT an EOB token, but filled |
|
575 the block up with exactly 64 coefficients, _last_zzi will be less than 64. |
|
576 Provided the last token was not a pure zero run, the minimum value it can |
|
577 be is 46, and so that doesn't affect any of the cases in this routine. |
|
578 However, if the last token WAS a pure zero run of length 63, then _last_zzi |
|
579 will be 1 while the number of coefficients decoded is 64. |
|
580 Thus, we will trigger the following special case, where the real |
|
581 coefficient count would not. |
|
582 Note also that a zero run of length 64 will give _last_zzi a value of 0, |
|
583 but we still process the DC coefficient, which might have a non-zero value |
|
584 due to DC prediction. |
|
585 Although convoluted, this is arguably the correct behavior: it allows us to |
|
586 use a smaller transform when the block ends with a long zero run instead |
|
587 of a normal EOB token. |
|
588 It could be smarter... multiple separate zero runs at the end of a block |
|
589 will fool it, but an encoder that generates these really deserves what it |
|
590 gets. |
|
591 Needless to say we inherited this approach from VP3.*/ |
|
592 /*Perform the iDCT.*/ |
|
593 if(_last_zzi<=10)oc_idct8x8_10(_y,_x); |
|
594 else oc_idct8x8_slow(_y,_x); |
|
595 } |
|
596 |
|
597 #endif |