|
1 /******************************************************************** |
|
2 * * |
|
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
|
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
|
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
|
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
|
7 * * |
|
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
|
9 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
|
10 * * |
|
11 ******************************************************************** |
|
12 |
|
13 function: |
|
14 last mod: $Id: mmxidct.c 17446 2010-09-23 20:06:20Z tterribe $ |
|
15 |
|
16 ********************************************************************/ |
|
17 |
|
18 /*MMX acceleration of Theora's iDCT. |
|
19 Originally written by Rudolf Marek, based on code from On2's VP3.*/ |
|
20 #include "x86int.h" |
|
21 #include "../dct.h" |
|
22 |
|
23 #if defined(OC_X86_ASM) |
|
24 |
|
/*These are offsets (in 16-bit elements) into the iDCT table of constants.
  NOTE(review): the assembly in this file does not use these macros; it
   addresses the constant operand by literal byte offset instead (0x00 for the
   value added before the final psraw $4, and 0x10...0x70 for the pmulhw
   multipliers).
  Confirm these two values still describe the layout of OC_IDCT_CONSTS before
   relying on them.*/
/*7 rows of cosines, in order: pi/16 * (1 ... 7).*/
#define OC_COSINE_OFFSET (0)
/*A row of 8's.*/
#define OC_EIGHT_OFFSET  (56)
|
30 |
|
31 |
|
32 |
|
/*38 cycles*/
/*Common front end of the 1-D iDCT, expanded as a fragment of an inline
   assembly template.
  Memory operands are spelled through OC_I(), OC_J() and OC_MEM_OFFS(), which
   must be defined by the asm statement that expands this macro; `c` is the
   name of the constant-table operand.
  _x: Operand name the eight inputs are loaded from: I(0)...I(3) and
       J(4)...J(7).
  _y: Operand name used for two temporaries, I(1) and I(2).
  Each movq/pmulhw/paddw/psubw works on four 16-bit lanes at once.
  The multipliers are read from byte offsets 0x10...0x70 of `c` (presumably the
   seven cosine rows; the table itself is not defined in this file).
  On exit one intermediate is left in I(2,_y), mm0 has been reloaded from
   I(1,_y), and the remaining intermediates are in mm1...mm7 for the row or
   column back end that must follow immediately.
  The definition intentionally does not end with a line continuation, so it
   does not depend on the next source line being empty.*/
#define OC_IDCT_BEGIN(_y,_x) \
  "#OC_IDCT_BEGIN\n\t" \
  "movq "OC_I(3,_x)",%%mm2\n\t" \
  "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
  "movq %%mm2,%%mm4\n\t" \
  "movq "OC_J(5,_x)",%%mm7\n\t" \
  "pmulhw %%mm6,%%mm4\n\t" \
  "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
  "pmulhw %%mm7,%%mm6\n\t" \
  "movq %%mm1,%%mm5\n\t" \
  "pmulhw %%mm2,%%mm1\n\t" \
  "movq "OC_I(1,_x)",%%mm3\n\t" \
  "pmulhw %%mm7,%%mm5\n\t" \
  "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
  "paddw %%mm2,%%mm4\n\t" \
  "paddw %%mm7,%%mm6\n\t" \
  "paddw %%mm1,%%mm2\n\t" \
  "movq "OC_J(7,_x)",%%mm1\n\t" \
  "paddw %%mm5,%%mm7\n\t" \
  "movq %%mm0,%%mm5\n\t" \
  "pmulhw %%mm3,%%mm0\n\t" \
  "paddw %%mm7,%%mm4\n\t" \
  "pmulhw %%mm1,%%mm5\n\t" \
  "movq "OC_MEM_OFFS(0x70,c)",%%mm7\n\t" \
  "psubw %%mm2,%%mm6\n\t" \
  "paddw %%mm3,%%mm0\n\t" \
  "pmulhw %%mm7,%%mm3\n\t" \
  "movq "OC_I(2,_x)",%%mm2\n\t" \
  "pmulhw %%mm1,%%mm7\n\t" \
  "paddw %%mm1,%%mm5\n\t" \
  "movq %%mm2,%%mm1\n\t" \
  "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm2\n\t" \
  "psubw %%mm5,%%mm3\n\t" \
  "movq "OC_J(6,_x)",%%mm5\n\t" \
  "paddw %%mm7,%%mm0\n\t" \
  "movq %%mm5,%%mm7\n\t" \
  "psubw %%mm4,%%mm0\n\t" \
  "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
  "paddw %%mm1,%%mm2\n\t" \
  "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
  "paddw %%mm4,%%mm4\n\t" \
  "paddw %%mm0,%%mm4\n\t" \
  "psubw %%mm6,%%mm3\n\t" \
  "paddw %%mm7,%%mm5\n\t" \
  "paddw %%mm6,%%mm6\n\t" \
  "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
  "paddw %%mm3,%%mm6\n\t" \
  "movq %%mm4,"OC_I(1,_y)"\n\t" \
  "psubw %%mm5,%%mm1\n\t" \
  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
  "movq %%mm3,%%mm5\n\t" \
  "pmulhw %%mm4,%%mm3\n\t" \
  "paddw %%mm2,%%mm7\n\t" \
  "movq %%mm6,"OC_I(2,_y)"\n\t" \
  "movq %%mm0,%%mm2\n\t" \
  "movq "OC_I(0,_x)",%%mm6\n\t" \
  "pmulhw %%mm4,%%mm0\n\t" \
  "paddw %%mm3,%%mm5\n\t" \
  "movq "OC_J(4,_x)",%%mm3\n\t" \
  "psubw %%mm1,%%mm5\n\t" \
  "paddw %%mm0,%%mm2\n\t" \
  "psubw %%mm3,%%mm6\n\t" \
  "movq %%mm6,%%mm0\n\t" \
  "pmulhw %%mm4,%%mm6\n\t" \
  "paddw %%mm3,%%mm3\n\t" \
  "paddw %%mm1,%%mm1\n\t" \
  "paddw %%mm0,%%mm3\n\t" \
  "paddw %%mm5,%%mm1\n\t" \
  "pmulhw %%mm3,%%mm4\n\t" \
  "paddw %%mm0,%%mm6\n\t" \
  "psubw %%mm2,%%mm6\n\t" \
  "paddw %%mm2,%%mm2\n\t" \
  "movq "OC_I(1,_y)",%%mm0\n\t" \
  "paddw %%mm6,%%mm2\n\t" \
  "paddw %%mm3,%%mm4\n\t" \
  "psubw %%mm1,%%mm2\n\t" \
  "#end OC_IDCT_BEGIN\n\t"
|
111 |
|
/*38+8=46 cycles.*/
/*Row pass of the iDCT: OC_IDCT_BEGIN followed by the final butterflies.
  No rounding or shifting is applied here.
  _x: Operand name the inputs are read from (via OC_IDCT_BEGIN).
  _y: Operand name used for temporaries and for the one result that is
       spilled.
  On exit R1 has been stored to I(1,_y); R0 is in mm0 and R3...R7 are in
   mm3...mm7, as the register comments below record.
  mm2 is only read by this back end, so it still holds what OC_IDCT_BEGIN left
   there.
  The definition intentionally does not end with a line continuation.*/
#define OC_ROW_IDCT(_y,_x) \
  "#OC_ROW_IDCT\n" \
  OC_IDCT_BEGIN(_y,_x) \
  /*r3=D'*/ \
  "movq "OC_I(2,_y)",%%mm3\n\t" \
  /*r4=E'=E-G*/ \
  "psubw %%mm7,%%mm4\n\t" \
  /*r1=H'+H'*/ \
  "paddw %%mm1,%%mm1\n\t" \
  /*r7=G+G*/ \
  "paddw %%mm7,%%mm7\n\t" \
  /*r1=R1=A''+H'*/ \
  "paddw %%mm2,%%mm1\n\t" \
  /*r7=G'=E+G*/ \
  "paddw %%mm4,%%mm7\n\t" \
  /*r4=R4=E'-D'*/ \
  "psubw %%mm3,%%mm4\n\t" \
  "paddw %%mm3,%%mm3\n\t" \
  /*r6=R6=F'-B''*/ \
  "psubw %%mm5,%%mm6\n\t" \
  "paddw %%mm5,%%mm5\n\t" \
  /*r3=R3=E'+D'*/ \
  "paddw %%mm4,%%mm3\n\t" \
  /*r5=R5=F'+B''*/ \
  "paddw %%mm6,%%mm5\n\t" \
  /*r7=R7=G'-C'*/ \
  "psubw %%mm0,%%mm7\n\t" \
  "paddw %%mm0,%%mm0\n\t" \
  /*Save R1.*/ \
  "movq %%mm1,"OC_I(1,_y)"\n\t" \
  /*r0=R0=G'+C'*/ \
  "paddw %%mm7,%%mm0\n\t" \
  "#end OC_ROW_IDCT\n\t"
|
146 |
|
/*The following macro does two 4x4 transposes in place.
  At entry, we assume:
    r0 = a3 a2 a1 a0
  I(1) = b3 b2 b1 b0
    r2 = c3 c2 c1 c0
    r3 = d3 d2 d1 d0

    r4 = e3 e2 e1 e0
    r5 = f3 f2 f1 f0
    r6 = g3 g2 g1 g0
    r7 = h3 h2 h1 h0

  At exit, we have:
  I(0) = d0 c0 b0 a0
  I(1) = d1 c1 b1 a1
  I(2) = d2 c2 b2 a2
  I(3) = d3 c3 b3 a3

  J(4) = h0 g0 f0 e0
  J(5) = h1 g1 f1 e1
  J(6) = h2 g2 f2 e2
  J(7) = h3 g3 f3 e3

  I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
  J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.

  Since r1 is free at entry, we calculate the Js first.
  r0 is parked in I(0) while the Js are built and reloaded afterwards.
  OC_I() and OC_J() must be defined by the asm statement that expands this
   macro; _y is the operand name all eight rows are written to.
  The definition intentionally does not end with a line continuation.*/
/*19 cycles.*/
#define OC_TRANSPOSE(_y) \
  "#OC_TRANSPOSE\n\t" \
  "movq %%mm4,%%mm1\n\t" \
  "punpcklwd %%mm5,%%mm4\n\t" \
  "movq %%mm0,"OC_I(0,_y)"\n\t" \
  "punpckhwd %%mm5,%%mm1\n\t" \
  "movq %%mm6,%%mm0\n\t" \
  "punpcklwd %%mm7,%%mm6\n\t" \
  "movq %%mm4,%%mm5\n\t" \
  "punpckldq %%mm6,%%mm4\n\t" \
  "punpckhdq %%mm6,%%mm5\n\t" \
  "movq %%mm1,%%mm6\n\t" \
  "movq %%mm4,"OC_J(4,_y)"\n\t" \
  "punpckhwd %%mm7,%%mm0\n\t" \
  "movq %%mm5,"OC_J(5,_y)"\n\t" \
  "punpckhdq %%mm0,%%mm6\n\t" \
  "movq "OC_I(0,_y)",%%mm4\n\t" \
  "punpckldq %%mm0,%%mm1\n\t" \
  "movq "OC_I(1,_y)",%%mm5\n\t" \
  "movq %%mm4,%%mm0\n\t" \
  "movq %%mm6,"OC_J(7,_y)"\n\t" \
  "punpcklwd %%mm5,%%mm0\n\t" \
  "movq %%mm1,"OC_J(6,_y)"\n\t" \
  "punpckhwd %%mm5,%%mm4\n\t" \
  "movq %%mm2,%%mm5\n\t" \
  "punpcklwd %%mm3,%%mm2\n\t" \
  "movq %%mm0,%%mm1\n\t" \
  "punpckldq %%mm2,%%mm0\n\t" \
  "punpckhdq %%mm2,%%mm1\n\t" \
  "movq %%mm4,%%mm2\n\t" \
  "movq %%mm0,"OC_I(0,_y)"\n\t" \
  "punpckhwd %%mm3,%%mm5\n\t" \
  "movq %%mm1,"OC_I(1,_y)"\n\t" \
  "punpckhdq %%mm5,%%mm4\n\t" \
  "punpckldq %%mm5,%%mm2\n\t" \
  "movq %%mm4,"OC_I(3,_y)"\n\t" \
  "movq %%mm2,"OC_I(2,_y)"\n\t" \
  "#end OC_TRANSPOSE\n\t"
|
213 |
|
/*38+19=57 cycles.*/
/*Column pass of the iDCT: OC_IDCT_BEGIN (reading and writing the same
   operand) followed by the final butterflies, rounding and scaling.
  _y: Operand name the eight inputs are read from and the eight results
       NR0...NR7 are stored to, at I(0)...I(3) and J(4)...J(7).
  The value at byte offset 0x00 of the constant operand `c` is added once per
   butterfly pair, to the member that is formed first; its partner is then
   built from that already-biased value, so every output carries the bias
   before its arithmetic right shift by 4 (psraw $4).
  That constant is presumably the row of 8's, which would make the shift round
   to nearest; the table is not defined in this file.
  The definition intentionally does not end with a line continuation.*/
#define OC_COLUMN_IDCT(_y) \
  "#OC_COLUMN_IDCT\n" \
  OC_IDCT_BEGIN(_y,_y) \
  "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
  /*r1=H'+H'*/ \
  "paddw %%mm1,%%mm1\n\t" \
  /*r1=R1=A''+H'*/ \
  "paddw %%mm2,%%mm1\n\t" \
  /*r2=NR2*/ \
  "psraw $4,%%mm2\n\t" \
  /*r4=E'=E-G*/ \
  "psubw %%mm7,%%mm4\n\t" \
  /*r1=NR1*/ \
  "psraw $4,%%mm1\n\t" \
  /*r3=D'*/ \
  "movq "OC_I(2,_y)",%%mm3\n\t" \
  /*r7=G+G*/ \
  "paddw %%mm7,%%mm7\n\t" \
  /*Store NR2 at I(2).*/ \
  "movq %%mm2,"OC_I(2,_y)"\n\t" \
  /*r7=G'=E+G*/ \
  "paddw %%mm4,%%mm7\n\t" \
  /*Store NR1 at I(1).*/ \
  "movq %%mm1,"OC_I(1,_y)"\n\t" \
  /*r4=R4=E'-D'*/ \
  "psubw %%mm3,%%mm4\n\t" \
  "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
  /*r3=D'+D'*/ \
  "paddw %%mm3,%%mm3\n\t" \
  /*r3=R3=E'+D'*/ \
  "paddw %%mm4,%%mm3\n\t" \
  /*r4=NR4*/ \
  "psraw $4,%%mm4\n\t" \
  /*r6=R6=F'-B''*/ \
  "psubw %%mm5,%%mm6\n\t" \
  /*r3=NR3*/ \
  "psraw $4,%%mm3\n\t" \
  "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
  /*r5=B''+B''*/ \
  "paddw %%mm5,%%mm5\n\t" \
  /*r5=R5=F'+B''*/ \
  "paddw %%mm6,%%mm5\n\t" \
  /*r6=NR6*/ \
  "psraw $4,%%mm6\n\t" \
  /*Store NR4 at J(4).*/ \
  "movq %%mm4,"OC_J(4,_y)"\n\t" \
  /*r5=NR5*/ \
  "psraw $4,%%mm5\n\t" \
  /*Store NR3 at I(3).*/ \
  "movq %%mm3,"OC_I(3,_y)"\n\t" \
  /*r7=R7=G'-C'*/ \
  "psubw %%mm0,%%mm7\n\t" \
  "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
  /*r0=C'+C'*/ \
  "paddw %%mm0,%%mm0\n\t" \
  /*r0=R0=G'+C'*/ \
  "paddw %%mm7,%%mm0\n\t" \
  /*r7=NR7*/ \
  "psraw $4,%%mm7\n\t" \
  /*Store NR6 at J(6).*/ \
  "movq %%mm6,"OC_J(6,_y)"\n\t" \
  /*r0=NR0*/ \
  "psraw $4,%%mm0\n\t" \
  /*Store NR5 at J(5).*/ \
  "movq %%mm5,"OC_J(5,_y)"\n\t" \
  /*Store NR7 at J(7).*/ \
  "movq %%mm7,"OC_J(7,_y)"\n\t" \
  /*Store NR0 at I(0).*/ \
  "movq %%mm0,"OC_I(0,_y)"\n\t" \
  "#end OC_COLUMN_IDCT\n\t"
|
285 |
|
286 static void oc_idct8x8_slow_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
|
287 /*This routine accepts an 8x8 matrix, but in partially transposed form. |
|
288 Every 4x4 block is transposed.*/ |
|
289 __asm__ __volatile__( |
|
290 #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) |
|
291 #define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y) |
|
292 OC_ROW_IDCT(y,x) |
|
293 OC_TRANSPOSE(y) |
|
294 #undef OC_I |
|
295 #undef OC_J |
|
296 #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+64,_y) |
|
297 #define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+72,_y) |
|
298 OC_ROW_IDCT(y,x) |
|
299 OC_TRANSPOSE(y) |
|
300 #undef OC_I |
|
301 #undef OC_J |
|
302 #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) |
|
303 #define OC_J(_k,_y) OC_I(_k,_y) |
|
304 OC_COLUMN_IDCT(y) |
|
305 #undef OC_I |
|
306 #undef OC_J |
|
307 #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y) |
|
308 #define OC_J(_k,_y) OC_I(_k,_y) |
|
309 OC_COLUMN_IDCT(y) |
|
310 #undef OC_I |
|
311 #undef OC_J |
|
312 :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64) |
|
313 :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), |
|
314 [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128) |
|
315 ); |
|
316 if(_x!=_y){ |
|
317 int i; |
|
318 __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::); |
|
319 for(i=0;i<4;i++){ |
|
320 __asm__ __volatile__( |
|
321 "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" |
|
322 "movq %%mm0,"OC_MEM_OFFS(0x08,x)"\n\t" |
|
323 "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" |
|
324 "movq %%mm0,"OC_MEM_OFFS(0x18,x)"\n\t" |
|
325 :[x]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_x+16*i,16) |
|
326 ); |
|
327 } |
|
328 } |
|
329 } |
|
330 |
|
/*25 cycles.*/
/*Reduced front end of the 1-D iDCT, expanded as a fragment of an inline
   assembly template.
  Unlike OC_IDCT_BEGIN it loads only I(0)...I(3) from _x and never references
   OC_J().
  Memory operands are spelled through OC_I() and OC_MEM_OFFS(), which must be
   defined by the asm statement that expands this macro; `c` is the name of
   the constant-table operand.
  _x: Operand name the four inputs are loaded from.
  _y: Operand name used for two temporaries, I(1) and I(2).
  On exit one intermediate is left in I(2,_y), mm0 has been reloaded from
   I(1,_y), and the remaining intermediates are in mm1...mm7 for the row or
   column back end that must follow immediately.
  The definition intentionally does not end with a line continuation, so it
   does not depend on the next source line being empty.*/
#define OC_IDCT_BEGIN_10(_y,_x) \
  "#OC_IDCT_BEGIN_10\n\t" \
  "movq "OC_I(3,_x)",%%mm2\n\t" \
  "nop\n\t" \
  "movq "OC_MEM_OFFS(0x30,c)",%%mm6\n\t" \
  "movq %%mm2,%%mm4\n\t" \
  "movq "OC_MEM_OFFS(0x50,c)",%%mm1\n\t" \
  "pmulhw %%mm6,%%mm4\n\t" \
  "movq "OC_I(1,_x)",%%mm3\n\t" \
  "pmulhw %%mm2,%%mm1\n\t" \
  "movq "OC_MEM_OFFS(0x10,c)",%%mm0\n\t" \
  "paddw %%mm2,%%mm4\n\t" \
  "pxor %%mm6,%%mm6\n\t" \
  "paddw %%mm1,%%mm2\n\t" \
  "movq "OC_I(2,_x)",%%mm5\n\t" \
  "pmulhw %%mm3,%%mm0\n\t" \
  "movq %%mm5,%%mm1\n\t" \
  "paddw %%mm3,%%mm0\n\t" \
  "pmulhw "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
  "psubw %%mm2,%%mm6\n\t" \
  "pmulhw "OC_MEM_OFFS(0x20,c)",%%mm5\n\t" \
  "psubw %%mm4,%%mm0\n\t" \
  "movq "OC_I(2,_x)",%%mm7\n\t" \
  "paddw %%mm4,%%mm4\n\t" \
  "paddw %%mm5,%%mm7\n\t" \
  "paddw %%mm0,%%mm4\n\t" \
  "pmulhw "OC_MEM_OFFS(0x60,c)",%%mm1\n\t" \
  "psubw %%mm6,%%mm3\n\t" \
  "movq %%mm4,"OC_I(1,_y)"\n\t" \
  "paddw %%mm6,%%mm6\n\t" \
  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
  "paddw %%mm3,%%mm6\n\t" \
  "movq %%mm3,%%mm5\n\t" \
  "pmulhw %%mm4,%%mm3\n\t" \
  "movq %%mm6,"OC_I(2,_y)"\n\t" \
  "movq %%mm0,%%mm2\n\t" \
  "movq "OC_I(0,_x)",%%mm6\n\t" \
  "pmulhw %%mm4,%%mm0\n\t" \
  "paddw %%mm3,%%mm5\n\t" \
  "paddw %%mm0,%%mm2\n\t" \
  "psubw %%mm1,%%mm5\n\t" \
  "pmulhw %%mm4,%%mm6\n\t" \
  "paddw "OC_I(0,_x)",%%mm6\n\t" \
  "paddw %%mm1,%%mm1\n\t" \
  "movq %%mm6,%%mm4\n\t" \
  "paddw %%mm5,%%mm1\n\t" \
  "psubw %%mm2,%%mm6\n\t" \
  "paddw %%mm2,%%mm2\n\t" \
  "movq "OC_I(1,_y)",%%mm0\n\t" \
  "paddw %%mm6,%%mm2\n\t" \
  "psubw %%mm1,%%mm2\n\t" \
  "nop\n\t" \
  "#end OC_IDCT_BEGIN_10\n\t"
|
385 |
|
/*25+8=33 cycles.*/
/*Row pass of the reduced iDCT: OC_IDCT_BEGIN_10 followed by the final
   butterflies.
  No rounding or shifting is applied here.
  _x: Operand name the inputs are read from (via OC_IDCT_BEGIN_10).
  _y: Operand name used for temporaries and for the one result that is
       spilled.
  On exit R1 has been stored to I(1,_y); R0 is in mm0 and R3...R7 are in
   mm3...mm7, as the register comments below record.
  mm2 is only read by this back end, so it still holds what OC_IDCT_BEGIN_10
   left there.
  The definition intentionally does not end with a line continuation.*/
#define OC_ROW_IDCT_10(_y,_x) \
  "#OC_ROW_IDCT_10\n\t" \
  OC_IDCT_BEGIN_10(_y,_x) \
  /*r3=D'*/ \
  "movq "OC_I(2,_y)",%%mm3\n\t" \
  /*r4=E'=E-G*/ \
  "psubw %%mm7,%%mm4\n\t" \
  /*r1=H'+H'*/ \
  "paddw %%mm1,%%mm1\n\t" \
  /*r7=G+G*/ \
  "paddw %%mm7,%%mm7\n\t" \
  /*r1=R1=A''+H'*/ \
  "paddw %%mm2,%%mm1\n\t" \
  /*r7=G'=E+G*/ \
  "paddw %%mm4,%%mm7\n\t" \
  /*r4=R4=E'-D'*/ \
  "psubw %%mm3,%%mm4\n\t" \
  "paddw %%mm3,%%mm3\n\t" \
  /*r6=R6=F'-B''*/ \
  "psubw %%mm5,%%mm6\n\t" \
  "paddw %%mm5,%%mm5\n\t" \
  /*r3=R3=E'+D'*/ \
  "paddw %%mm4,%%mm3\n\t" \
  /*r5=R5=F'+B''*/ \
  "paddw %%mm6,%%mm5\n\t" \
  /*r7=R7=G'-C'*/ \
  "psubw %%mm0,%%mm7\n\t" \
  "paddw %%mm0,%%mm0\n\t" \
  /*Save R1.*/ \
  "movq %%mm1,"OC_I(1,_y)"\n\t" \
  /*r0=R0=G'+C'*/ \
  "paddw %%mm7,%%mm0\n\t" \
  "#end OC_ROW_IDCT_10\n\t"
|
420 |
|
/*25+19=44 cycles.*/
/*Column pass of the reduced iDCT: OC_IDCT_BEGIN_10 (reading and writing the
   same operand) followed by the final butterflies, rounding and scaling.
  _y: Operand name the inputs are read from and the eight results NR0...NR7
       are stored to, at I(0)...I(3) and J(4)...J(7).
  The value at byte offset 0x00 of the constant operand `c` is added once per
   butterfly pair, to the member that is formed first; its partner is then
   built from that already-biased value, so every output carries the bias
   before its arithmetic right shift by 4 (psraw $4).
  That constant is presumably the row of 8's, which would make the shift round
   to nearest; the table is not defined in this file.
  The definition intentionally does not end with a line continuation.*/
#define OC_COLUMN_IDCT_10(_y) \
  "#OC_COLUMN_IDCT_10\n\t" \
  OC_IDCT_BEGIN_10(_y,_y) \
  "paddw "OC_MEM_OFFS(0x00,c)",%%mm2\n\t" \
  /*r1=H'+H'*/ \
  "paddw %%mm1,%%mm1\n\t" \
  /*r1=R1=A''+H'*/ \
  "paddw %%mm2,%%mm1\n\t" \
  /*r2=NR2*/ \
  "psraw $4,%%mm2\n\t" \
  /*r4=E'=E-G*/ \
  "psubw %%mm7,%%mm4\n\t" \
  /*r1=NR1*/ \
  "psraw $4,%%mm1\n\t" \
  /*r3=D'*/ \
  "movq "OC_I(2,_y)",%%mm3\n\t" \
  /*r7=G+G*/ \
  "paddw %%mm7,%%mm7\n\t" \
  /*Store NR2 at I(2).*/ \
  "movq %%mm2,"OC_I(2,_y)"\n\t" \
  /*r7=G'=E+G*/ \
  "paddw %%mm4,%%mm7\n\t" \
  /*Store NR1 at I(1).*/ \
  "movq %%mm1,"OC_I(1,_y)"\n\t" \
  /*r4=R4=E'-D'*/ \
  "psubw %%mm3,%%mm4\n\t" \
  "paddw "OC_MEM_OFFS(0x00,c)",%%mm4\n\t" \
  /*r3=D'+D'*/ \
  "paddw %%mm3,%%mm3\n\t" \
  /*r3=R3=E'+D'*/ \
  "paddw %%mm4,%%mm3\n\t" \
  /*r4=NR4*/ \
  "psraw $4,%%mm4\n\t" \
  /*r6=R6=F'-B''*/ \
  "psubw %%mm5,%%mm6\n\t" \
  /*r3=NR3*/ \
  "psraw $4,%%mm3\n\t" \
  "paddw "OC_MEM_OFFS(0x00,c)",%%mm6\n\t" \
  /*r5=B''+B''*/ \
  "paddw %%mm5,%%mm5\n\t" \
  /*r5=R5=F'+B''*/ \
  "paddw %%mm6,%%mm5\n\t" \
  /*r6=NR6*/ \
  "psraw $4,%%mm6\n\t" \
  /*Store NR4 at J(4).*/ \
  "movq %%mm4,"OC_J(4,_y)"\n\t" \
  /*r5=NR5*/ \
  "psraw $4,%%mm5\n\t" \
  /*Store NR3 at I(3).*/ \
  "movq %%mm3,"OC_I(3,_y)"\n\t" \
  /*r7=R7=G'-C'*/ \
  "psubw %%mm0,%%mm7\n\t" \
  "paddw "OC_MEM_OFFS(0x00,c)",%%mm7\n\t" \
  /*r0=C'+C'*/ \
  "paddw %%mm0,%%mm0\n\t" \
  /*r0=R0=G'+C'*/ \
  "paddw %%mm7,%%mm0\n\t" \
  /*r7=NR7*/ \
  "psraw $4,%%mm7\n\t" \
  /*Store NR6 at J(6).*/ \
  "movq %%mm6,"OC_J(6,_y)"\n\t" \
  /*r0=NR0*/ \
  "psraw $4,%%mm0\n\t" \
  /*Store NR5 at J(5).*/ \
  "movq %%mm5,"OC_J(5,_y)"\n\t" \
  /*Store NR7 at J(7).*/ \
  "movq %%mm7,"OC_J(7,_y)"\n\t" \
  /*Store NR0 at I(0).*/ \
  "movq %%mm0,"OC_I(0,_y)"\n\t" \
  "#end OC_COLUMN_IDCT_10\n\t"
|
492 |
|
493 static void oc_idct8x8_10_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
|
494 __asm__ __volatile__( |
|
495 #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) |
|
496 #define OC_J(_k,_y) OC_MEM_OFFS(((_k)-4)*16+8,_y) |
|
497 /*Done with dequant, descramble, and partial transpose. |
|
498 Now do the iDCT itself.*/ |
|
499 OC_ROW_IDCT_10(y,x) |
|
500 OC_TRANSPOSE(y) |
|
501 #undef OC_I |
|
502 #undef OC_J |
|
503 #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16,_y) |
|
504 #define OC_J(_k,_y) OC_I(_k,_y) |
|
505 OC_COLUMN_IDCT_10(y) |
|
506 #undef OC_I |
|
507 #undef OC_J |
|
508 #define OC_I(_k,_y) OC_MEM_OFFS((_k)*16+8,_y) |
|
509 #define OC_J(_k,_y) OC_I(_k,_y) |
|
510 OC_COLUMN_IDCT_10(y) |
|
511 #undef OC_I |
|
512 #undef OC_J |
|
513 :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_y,64) |
|
514 :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), |
|
515 [c]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128) |
|
516 ); |
|
517 if(_x!=_y){ |
|
518 __asm__ __volatile__( |
|
519 "pxor %%mm0,%%mm0\n\t" |
|
520 "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" |
|
521 "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" |
|
522 "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t" |
|
523 "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t" |
|
524 :[x]"+m"OC_ARRAY_OPERAND(ogg_int16_t,_x,28) |
|
525 ); |
|
526 } |
|
527 } |
|
528 |
|
529 /*Performs an inverse 8x8 Type-II DCT transform. |
|
530 The input is assumed to be scaled by a factor of 4 relative to orthonormal |
|
531 version of the transform.*/ |
|
532 void oc_idct8x8_mmx(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ |
|
533 /*_last_zzi is subtly different from an actual count of the number of |
|
534 coefficients we decoded for this block. |
|
535 It contains the value of zzi BEFORE the final token in the block was |
|
536 decoded. |
|
537 In most cases this is an EOB token (the continuation of an EOB run from a |
|
538 previous block counts), and so this is the same as the coefficient count. |
|
539 However, in the case that the last token was NOT an EOB token, but filled |
|
540 the block up with exactly 64 coefficients, _last_zzi will be less than 64. |
|
541 Provided the last token was not a pure zero run, the minimum value it can |
|
542 be is 46, and so that doesn't affect any of the cases in this routine. |
|
543 However, if the last token WAS a pure zero run of length 63, then _last_zzi |
|
544 will be 1 while the number of coefficients decoded is 64. |
|
545 Thus, we will trigger the following special case, where the real |
|
546 coefficient count would not. |
|
547 Note also that a zero run of length 64 will give _last_zzi a value of 0, |
|
548 but we still process the DC coefficient, which might have a non-zero value |
|
549 due to DC prediction. |
|
550 Although convoluted, this is arguably the correct behavior: it allows us to |
|
551 use a smaller transform when the block ends with a long zero run instead |
|
552 of a normal EOB token. |
|
553 It could be smarter... multiple separate zero runs at the end of a block |
|
554 will fool it, but an encoder that generates these really deserves what it |
|
555 gets. |
|
556 Needless to say we inherited this approach from VP3.*/ |
|
557 /*Then perform the iDCT.*/ |
|
558 if(_last_zzi<=10)oc_idct8x8_10_mmx(_y,_x); |
|
559 else oc_idct8x8_slow_mmx(_y,_x); |
|
560 } |
|
561 |
|
562 #endif |