/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
    last mod: $Id: mmxidct.c 16503 2009-08-22 18:14:02Z giles $

 ********************************************************************/

/*SSE2 acceleration of Theora's iDCT.*/
#include "x86int.h"
#include "sse2trans.h"
#include "../dct.h"

#if defined(OC_X86_ASM)

25 /*A table of constants used by the MMX routines.*/ |
|
26 const unsigned short __attribute__((aligned(16),used)) OC_IDCT_CONSTS[64]={ |
|
27 8, 8, 8, 8, 8, 8, 8, 8, |
|
28 OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7,OC_C1S7, |
|
29 OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6,OC_C2S6, |
|
30 OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5,OC_C3S5, |
|
31 OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4,OC_C4S4, |
|
32 OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3,OC_C5S3, |
|
33 OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2,OC_C6S2, |
|
34 OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1,OC_C7S1 |
|
35 }; |
|
36 |
|
37 |
|
/*Performs the first three stages of the iDCT.
  On input, xmm2, xmm6, xmm3, and xmm5 must contain rows 2, 6, 3, and 5 of the
   input, respectively (they are consumed in that order).
  Rows 7, 1, 4, and 0 are loaded from the asm operand named by _x, at byte
   offsets 0x70, 0x10, 0x40, and 0x00.
  The enclosing asm statement must also provide the operands [c] (the
   OC_IDCT_CONSTS table) and [buf] (two 16-byte slots of aligned scratch space,
   used to spill t[2] and t[3] during stage 1).
  On output, xmm7 down to xmm4 contain the terms t[0] through t[3], and xmm0 up
   to xmm3 contain the terms t[7] down to t[4], which is the layout expected
   by OC_IDCT_8x8_D and OC_IDCT_8x8_D_STORE.
  NOTE(review): wherever a pmulhw result is followed by a paddw of the same
   multiplicand, the constant presumably does not fit in a signed 16-bit
   value, so the add compensates for pmulhw treating it as negative; confirm
   against the constant values in dct.h.*/
#define OC_IDCT_8x8_ABC(_x) \
  "#OC_IDCT_8x8_ABC\n\t" \
  /*Stage 1:*/ \
  /*2-3 rotation by 6pi/16. \
    xmm4=xmm7=C6, xmm0=xmm1=C2, xmm2=X2, xmm6=X6.*/ \
  "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm1\n\t" \
  "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm4\n\t" \
  "movdqa %%xmm1,%%xmm0\n\t" \
  "pmulhw %%xmm2,%%xmm1\n\t" \
  "movdqa %%xmm4,%%xmm7\n\t" \
  "pmulhw %%xmm6,%%xmm0\n\t" \
  "pmulhw %%xmm2,%%xmm7\n\t" \
  "pmulhw %%xmm6,%%xmm4\n\t" \
  "paddw %%xmm6,%%xmm0\n\t" \
  "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm6\n\t" \
  "paddw %%xmm1,%%xmm2\n\t" \
  "psubw %%xmm0,%%xmm7\n\t" \
  /*Spill t[2] so that xmm7 can be reused.*/ \
  "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
  "paddw %%xmm4,%%xmm2\n\t" \
  "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm4\n\t" \
  /*Spill t[3] so that xmm2 can be reused.*/ \
  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
  /*5-6 rotation by 3pi/16. \
    xmm4=xmm2=C5, xmm1=xmm6=C3, xmm3=X3, xmm5=X5.*/ \
  "movdqa %%xmm4,%%xmm2\n\t" \
  "movdqa %%xmm6,%%xmm1\n\t" \
  "pmulhw %%xmm3,%%xmm4\n\t" \
  "pmulhw %%xmm5,%%xmm1\n\t" \
  "pmulhw %%xmm3,%%xmm6\n\t" \
  "pmulhw %%xmm5,%%xmm2\n\t" \
  "paddw %%xmm3,%%xmm4\n\t" \
  "paddw %%xmm5,%%xmm3\n\t" \
  "paddw %%xmm6,%%xmm3\n\t" \
  "movdqa "OC_MEM_OFFS(0x70,_x)",%%xmm6\n\t" \
  "paddw %%xmm5,%%xmm1\n\t" \
  "movdqa "OC_MEM_OFFS(0x10,_x)",%%xmm5\n\t" \
  "paddw %%xmm3,%%xmm2\n\t" \
  "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
  "psubw %%xmm4,%%xmm1\n\t" \
  "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm4\n\t" \
  /*4-7 rotation by 7pi/16. \
    xmm4=xmm7=C1, xmm3=xmm0=C7, xmm5=X1, xmm6=X7.*/ \
  "movdqa %%xmm3,%%xmm0\n\t" \
  "movdqa %%xmm4,%%xmm7\n\t" \
  "pmulhw %%xmm5,%%xmm3\n\t" \
  "pmulhw %%xmm5,%%xmm7\n\t" \
  "pmulhw %%xmm6,%%xmm4\n\t" \
  "pmulhw %%xmm6,%%xmm0\n\t" \
  "paddw %%xmm6,%%xmm4\n\t" \
  "movdqa "OC_MEM_OFFS(0x40,_x)",%%xmm6\n\t" \
  "paddw %%xmm5,%%xmm7\n\t" \
  "psubw %%xmm4,%%xmm3\n\t" \
  "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
  "paddw %%xmm7,%%xmm0\n\t" \
  "movdqa "OC_MEM_OFFS(0x00,_x)",%%xmm7\n\t" \
  /*0-1 butterfly. \
    xmm4=xmm5=C4, xmm7=X0, xmm6=X4.*/ \
  "paddw %%xmm7,%%xmm6\n\t" \
  "movdqa %%xmm4,%%xmm5\n\t" \
  "pmulhw %%xmm6,%%xmm4\n\t" \
  "paddw %%xmm7,%%xmm7\n\t" \
  "psubw %%xmm6,%%xmm7\n\t" \
  "paddw %%xmm6,%%xmm4\n\t" \
  /*Stage 2:*/ \
  /*4-5 butterfly: xmm3=t[4], xmm1=t[5] \
    7-6 butterfly: xmm2=t[6], xmm0=t[7]*/ \
  "movdqa %%xmm3,%%xmm6\n\t" \
  "paddw %%xmm1,%%xmm3\n\t" \
  "psubw %%xmm1,%%xmm6\n\t" \
  "movdqa %%xmm5,%%xmm1\n\t" \
  "pmulhw %%xmm7,%%xmm5\n\t" \
  "paddw %%xmm7,%%xmm5\n\t" \
  "movdqa %%xmm0,%%xmm7\n\t" \
  "paddw %%xmm2,%%xmm0\n\t" \
  "psubw %%xmm2,%%xmm7\n\t" \
  "movdqa %%xmm1,%%xmm2\n\t" \
  "pmulhw %%xmm6,%%xmm1\n\t" \
  "pmulhw %%xmm7,%%xmm2\n\t" \
  "paddw %%xmm6,%%xmm1\n\t" \
  /*Reload the t[2] and t[3] terms spilled during stage 1.*/ \
  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
  "paddw %%xmm7,%%xmm2\n\t" \
  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
  /*Stage 3: \
    6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
    0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
    1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
  "paddw %%xmm2,%%xmm1\n\t" \
  "paddw %%xmm5,%%xmm6\n\t" \
  "paddw %%xmm4,%%xmm7\n\t" \
  "paddw %%xmm2,%%xmm2\n\t" \
  "paddw %%xmm4,%%xmm4\n\t" \
  "paddw %%xmm5,%%xmm5\n\t" \
  "psubw %%xmm1,%%xmm2\n\t" \
  "psubw %%xmm7,%%xmm4\n\t" \
  "psubw %%xmm6,%%xmm5\n\t"

/*Performs the last stage of the iDCT, leaving the result in registers.
  On input, xmm7 down to xmm4 contain the terms t[0] through t[3], and xmm0 up
   to xmm3 contain the terms t[7] down to t[4] (the output layout of
   OC_IDCT_8x8_ABC).
  On output, xmm0 through xmm7 contain output rows 0 through 7.
  No rounding or scaling is applied, and no memory operands are used.*/
#define OC_IDCT_8x8_D \
  "#OC_IDCT_8x8_D\n\t" \
  /*Stage 4: \
    0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
    1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
    2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
    3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
  "psubw %%xmm0,%%xmm7\n\t" \
  "psubw %%xmm1,%%xmm6\n\t" \
  "psubw %%xmm2,%%xmm5\n\t" \
  "psubw %%xmm3,%%xmm4\n\t" \
  /*Each sum is formed as 2*b+(a-b), which avoids needing a spare register.*/ \
  "paddw %%xmm0,%%xmm0\n\t" \
  "paddw %%xmm1,%%xmm1\n\t" \
  "paddw %%xmm2,%%xmm2\n\t" \
  "paddw %%xmm3,%%xmm3\n\t" \
  "paddw %%xmm7,%%xmm0\n\t" \
  "paddw %%xmm6,%%xmm1\n\t" \
  "paddw %%xmm5,%%xmm2\n\t" \
  "paddw %%xmm4,%%xmm3\n\t"

/*Performs the last stage of the iDCT, then rounds, scales, and stores the
   result.
  On input, xmm7 down to xmm4 contain the terms t[0] through t[3], and xmm0 up
   to xmm3 contain the terms t[7] down to t[4] (the output layout of
   OC_IDCT_8x8_ABC).
  Each output row has the rounding constant from row 0 of [c] (the value 8)
   added to it, is shifted right arithmetically by 4, and is stored to the
   operand [y] at byte offset 0x10*row.
  The slot at offset 0x40 of [y] is used as scratch space for row 4 before the
   final value is written there.
  On output, xmm0 through xmm7 also contain the stored rows 0 through 7.*/
#define OC_IDCT_8x8_D_STORE \
  "#OC_IDCT_8x8_D_STORE\n\t" \
  /*Stage 4: \
    0-7 butterfly: xmm7=t[0], xmm0=t[7] -> xmm0=t[0]+t[7], xmm7=t[0]-t[7] \
    1-6 butterfly: xmm6=t[1], xmm1=t[6] -> xmm1=t[1]+t[6], xmm6=t[1]-t[6] \
    2-5 butterfly: xmm5=t[2], xmm2=t[5] -> xmm2=t[2]+t[5], xmm5=t[2]-t[5] \
    3-4 butterfly: xmm4=t[3], xmm3=t[4] -> xmm3=t[3]+t[4], xmm4=t[3]-t[4]*/ \
  "psubw %%xmm3,%%xmm4\n\t" \
  /*Park t[3]-t[4] in memory to free xmm4 for the rounding constant.*/ \
  "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
  "movdqa "OC_MEM_OFFS(0x00,c)",%%xmm4\n\t" \
  "psubw %%xmm0,%%xmm7\n\t" \
  "psubw %%xmm1,%%xmm6\n\t" \
  "psubw %%xmm2,%%xmm5\n\t" \
  /*Add the rounding constant to the differences; the sums inherit it when \
     the differences are added back in below.*/ \
  "paddw %%xmm4,%%xmm7\n\t" \
  "paddw %%xmm4,%%xmm6\n\t" \
  "paddw %%xmm4,%%xmm5\n\t" \
  "paddw "OC_MEM_OFFS(0x40,y)",%%xmm4\n\t" \
  "paddw %%xmm0,%%xmm0\n\t" \
  "paddw %%xmm1,%%xmm1\n\t" \
  "paddw %%xmm2,%%xmm2\n\t" \
  "paddw %%xmm3,%%xmm3\n\t" \
  "paddw %%xmm7,%%xmm0\n\t" \
  "paddw %%xmm6,%%xmm1\n\t" \
  "psraw $4,%%xmm0\n\t" \
  "paddw %%xmm5,%%xmm2\n\t" \
  "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" \
  "psraw $4,%%xmm1\n\t" \
  "paddw %%xmm4,%%xmm3\n\t" \
  "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" \
  "psraw $4,%%xmm2\n\t" \
  "movdqa %%xmm2,"OC_MEM_OFFS(0x20,y)"\n\t" \
  "psraw $4,%%xmm3\n\t" \
  "movdqa %%xmm3,"OC_MEM_OFFS(0x30,y)"\n\t" \
  "psraw $4,%%xmm4\n\t" \
  "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" \
  "psraw $4,%%xmm5\n\t" \
  "movdqa %%xmm5,"OC_MEM_OFFS(0x50,y)"\n\t" \
  "psraw $4,%%xmm6\n\t" \
  "movdqa %%xmm6,"OC_MEM_OFFS(0x60,y)"\n\t" \
  "psraw $4,%%xmm7\n\t" \
  "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t"

209 static void oc_idct8x8_slow_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
|
210 OC_ALIGN16(ogg_int16_t buf[16]); |
|
211 /*This routine accepts an 8x8 matrix pre-transposed.*/ |
|
212 __asm__ __volatile__( |
|
213 /*Load rows 2, 3, 5, and 6 for the first stage of the iDCT.*/ |
|
214 "movdqa "OC_MEM_OFFS(0x20,x)",%%xmm2\n\t" |
|
215 "movdqa "OC_MEM_OFFS(0x60,x)",%%xmm6\n\t" |
|
216 "movdqa "OC_MEM_OFFS(0x30,x)",%%xmm3\n\t" |
|
217 "movdqa "OC_MEM_OFFS(0x50,x)",%%xmm5\n\t" |
|
218 OC_IDCT_8x8_ABC(x) |
|
219 OC_IDCT_8x8_D |
|
220 OC_TRANSPOSE_8x8 |
|
221 /*Clear out rows 0, 1, 4, and 7 for the first stage of the iDCT.*/ |
|
222 "movdqa %%xmm7,"OC_MEM_OFFS(0x70,y)"\n\t" |
|
223 "movdqa %%xmm4,"OC_MEM_OFFS(0x40,y)"\n\t" |
|
224 "movdqa %%xmm1,"OC_MEM_OFFS(0x10,y)"\n\t" |
|
225 "movdqa %%xmm0,"OC_MEM_OFFS(0x00,y)"\n\t" |
|
226 OC_IDCT_8x8_ABC(y) |
|
227 OC_IDCT_8x8_D_STORE |
|
228 :[buf]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,buf,16)), |
|
229 [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) |
|
230 :[x]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64)), |
|
231 [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)) |
|
232 ); |
|
233 if(_x!=_y){ |
|
234 int i; |
|
235 __asm__ __volatile__("pxor %%xmm0,%%xmm0\n\t"::); |
|
236 /*Clear input data for next block (decoder only).*/ |
|
237 for(i=0;i<2;i++){ |
|
238 __asm__ __volatile__( |
|
239 "movdqa %%xmm0,"OC_MEM_OFFS(0x00,x)"\n\t" |
|
240 "movdqa %%xmm0,"OC_MEM_OFFS(0x10,x)"\n\t" |
|
241 "movdqa %%xmm0,"OC_MEM_OFFS(0x20,x)"\n\t" |
|
242 "movdqa %%xmm0,"OC_MEM_OFFS(0x30,x)"\n\t" |
|
243 :[x]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_x+i*32,32)) |
|
244 ); |
|
245 } |
|
246 } |
|
247 } |
|
248 |
|
/*For the first step of the 10-coefficient version of the 8x8 iDCT, we only
   need to work with four columns at a time.
  Doing this in MMX is faster on processors with a 64-bit data path.
  On input, mm0 through mm3 must contain the first four values of rows 0
   through 3; rows 4 through 7 are treated as zero.
  All four stages of the transform are performed.
  On output, mm0 through mm7 contain output rows 0 through 7 (four values
   each).
  The enclosing asm statement must provide the operands [c] (the
   OC_IDCT_CONSTS table) and [buf]; the first 8 bytes at offsets 0x00 and 0x10
   of [buf] are used as scratch space.*/
#define OC_IDCT_8x8_10_MMX \
  "#OC_IDCT_8x8_10_MMX\n\t" \
  /*Stage 1:*/ \
  /*2-3 rotation by 6pi/16. \
    mm7=C6, mm6=C2, mm2=X2, X6=0.*/ \
  "movq "OC_MEM_OFFS(0x60,c)",%%mm7\n\t" \
  "movq "OC_MEM_OFFS(0x20,c)",%%mm6\n\t" \
  "pmulhw %%mm2,%%mm6\n\t" \
  "pmulhw %%mm2,%%mm7\n\t" \
  "movq "OC_MEM_OFFS(0x50,c)",%%mm5\n\t" \
  "paddw %%mm6,%%mm2\n\t" \
  "movq %%mm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
  "movq "OC_MEM_OFFS(0x30,c)",%%mm2\n\t" \
  "movq %%mm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
  /*5-6 rotation by 3pi/16. \
    mm5=C5, mm2=C3, mm3=X3, X5=0.*/ \
  "pmulhw %%mm3,%%mm5\n\t" \
  "pmulhw %%mm3,%%mm2\n\t" \
  "movq "OC_MEM_OFFS(0x10,c)",%%mm7\n\t" \
  "paddw %%mm3,%%mm5\n\t" \
  "paddw %%mm3,%%mm2\n\t" \
  "movq "OC_MEM_OFFS(0x70,c)",%%mm3\n\t" \
  /*4-7 rotation by 7pi/16. \
    mm7=C1, mm3=C7, mm1=X1, X7=0.*/ \
  "pmulhw %%mm1,%%mm3\n\t" \
  "pmulhw %%mm1,%%mm7\n\t" \
  "movq "OC_MEM_OFFS(0x40,c)",%%mm4\n\t" \
  "movq %%mm3,%%mm6\n\t" \
  "paddw %%mm1,%%mm7\n\t" \
  /*0-1 butterfly. \
    mm4=C4, mm0=X0, X4=0.*/ \
  /*Stage 2:*/ \
  /*4-5 butterfly: mm3=t[4], mm5=t[5] \
    7-6 butterfly: mm2=t[6], mm7=t[7]*/ \
  "psubw %%mm5,%%mm3\n\t" \
  "paddw %%mm5,%%mm6\n\t" \
  "movq %%mm4,%%mm1\n\t" \
  "pmulhw %%mm0,%%mm4\n\t" \
  "paddw %%mm0,%%mm4\n\t" \
  "movq %%mm7,%%mm0\n\t" \
  "movq %%mm4,%%mm5\n\t" \
  "paddw %%mm2,%%mm0\n\t" \
  "psubw %%mm2,%%mm7\n\t" \
  "movq %%mm1,%%mm2\n\t" \
  "pmulhw %%mm6,%%mm1\n\t" \
  "pmulhw %%mm7,%%mm2\n\t" \
  "paddw %%mm6,%%mm1\n\t" \
  "movq "OC_MEM_OFFS(0x00,buf)",%%mm6\n\t" \
  "paddw %%mm7,%%mm2\n\t" \
  "movq "OC_MEM_OFFS(0x10,buf)",%%mm7\n\t" \
  /*Stage 3: \
    6-5 butterfly: mm1=t[5], mm2=t[6] -> mm1=t[6]+t[5], mm2=t[6]-t[5] \
    0-3 butterfly: mm4=t[0], mm7=t[3] -> mm7=t[0]+t[3], mm4=t[0]-t[3] \
    1-2 butterfly: mm5=t[1], mm6=t[2] -> mm6=t[1]+t[2], mm5=t[1]-t[2]*/ \
  "paddw %%mm2,%%mm1\n\t" \
  "paddw %%mm5,%%mm6\n\t" \
  "paddw %%mm4,%%mm7\n\t" \
  "paddw %%mm2,%%mm2\n\t" \
  "paddw %%mm4,%%mm4\n\t" \
  "paddw %%mm5,%%mm5\n\t" \
  "psubw %%mm1,%%mm2\n\t" \
  "psubw %%mm7,%%mm4\n\t" \
  "psubw %%mm6,%%mm5\n\t" \
  /*Stage 4: \
    0-7 butterfly: mm7=t[0], mm0=t[7] -> mm0=t[0]+t[7], mm7=t[0]-t[7] \
    1-6 butterfly: mm6=t[1], mm1=t[6] -> mm1=t[1]+t[6], mm6=t[1]-t[6] \
    2-5 butterfly: mm5=t[2], mm2=t[5] -> mm2=t[2]+t[5], mm5=t[2]-t[5] \
    3-4 butterfly: mm4=t[3], mm3=t[4] -> mm3=t[3]+t[4], mm4=t[3]-t[4]*/ \
  "psubw %%mm0,%%mm7\n\t" \
  "psubw %%mm1,%%mm6\n\t" \
  "psubw %%mm2,%%mm5\n\t" \
  "psubw %%mm3,%%mm4\n\t" \
  "paddw %%mm0,%%mm0\n\t" \
  "paddw %%mm1,%%mm1\n\t" \
  "paddw %%mm2,%%mm2\n\t" \
  "paddw %%mm3,%%mm3\n\t" \
  "paddw %%mm7,%%mm0\n\t" \
  "paddw %%mm6,%%mm1\n\t" \
  "paddw %%mm5,%%mm2\n\t" \
  "paddw %%mm4,%%mm3\n\t"

/*Performs the first three stages of the iDCT for input whose rows 4 through 7
   are all zero.
  On input, xmm0 through xmm3 must contain rows 0 through 3.
  The enclosing asm statement must provide the operands [c] (the
   OC_IDCT_CONSTS table) and [buf] (two 16-byte slots of aligned scratch
   space).
  On output, xmm7 down to xmm4 contain the terms t[0] through t[3], and xmm0 up
   to xmm3 contain the terms t[7] down to t[4], the same layout that
   OC_IDCT_8x8_ABC produces.*/
#define OC_IDCT_8x8_10_ABC \
  "#OC_IDCT_8x8_10_ABC\n\t" \
  /*Stage 1:*/ \
  /*2-3 rotation by 6pi/16. \
    xmm7=C6, xmm6=C2, xmm2=X2, X6=0.*/ \
  "movdqa "OC_MEM_OFFS(0x60,c)",%%xmm7\n\t" \
  "movdqa "OC_MEM_OFFS(0x20,c)",%%xmm6\n\t" \
  "pmulhw %%xmm2,%%xmm6\n\t" \
  "pmulhw %%xmm2,%%xmm7\n\t" \
  "movdqa "OC_MEM_OFFS(0x50,c)",%%xmm5\n\t" \
  "paddw %%xmm6,%%xmm2\n\t" \
  "movdqa %%xmm2,"OC_MEM_OFFS(0x10,buf)"\n\t" \
  "movdqa "OC_MEM_OFFS(0x30,c)",%%xmm2\n\t" \
  "movdqa %%xmm7,"OC_MEM_OFFS(0x00,buf)"\n\t" \
  /*5-6 rotation by 3pi/16. \
    xmm5=C5, xmm2=C3, xmm3=X3, X5=0.*/ \
  "pmulhw %%xmm3,%%xmm5\n\t" \
  "pmulhw %%xmm3,%%xmm2\n\t" \
  "movdqa "OC_MEM_OFFS(0x10,c)",%%xmm7\n\t" \
  "paddw %%xmm3,%%xmm5\n\t" \
  "paddw %%xmm3,%%xmm2\n\t" \
  "movdqa "OC_MEM_OFFS(0x70,c)",%%xmm3\n\t" \
  /*4-7 rotation by 7pi/16. \
    xmm7=C1, xmm3=C7, xmm1=X1, X7=0.*/ \
  "pmulhw %%xmm1,%%xmm3\n\t" \
  "pmulhw %%xmm1,%%xmm7\n\t" \
  "movdqa "OC_MEM_OFFS(0x40,c)",%%xmm4\n\t" \
  "movdqa %%xmm3,%%xmm6\n\t" \
  "paddw %%xmm1,%%xmm7\n\t" \
  /*0-1 butterfly. \
    xmm4=C4, xmm0=X0, X4=0.*/ \
  /*Stage 2:*/ \
  /*4-5 butterfly: xmm3=t[4], xmm5=t[5] \
    7-6 butterfly: xmm2=t[6], xmm7=t[7]*/ \
  "psubw %%xmm5,%%xmm3\n\t" \
  "paddw %%xmm5,%%xmm6\n\t" \
  "movdqa %%xmm4,%%xmm1\n\t" \
  "pmulhw %%xmm0,%%xmm4\n\t" \
  "paddw %%xmm0,%%xmm4\n\t" \
  "movdqa %%xmm7,%%xmm0\n\t" \
  "movdqa %%xmm4,%%xmm5\n\t" \
  "paddw %%xmm2,%%xmm0\n\t" \
  "psubw %%xmm2,%%xmm7\n\t" \
  "movdqa %%xmm1,%%xmm2\n\t" \
  "pmulhw %%xmm6,%%xmm1\n\t" \
  "pmulhw %%xmm7,%%xmm2\n\t" \
  "paddw %%xmm6,%%xmm1\n\t" \
  "movdqa "OC_MEM_OFFS(0x00,buf)",%%xmm6\n\t" \
  "paddw %%xmm7,%%xmm2\n\t" \
  "movdqa "OC_MEM_OFFS(0x10,buf)",%%xmm7\n\t" \
  /*Stage 3: \
    6-5 butterfly: xmm1=t[5], xmm2=t[6] -> xmm1=t[6]+t[5], xmm2=t[6]-t[5] \
    0-3 butterfly: xmm4=t[0], xmm7=t[3] -> xmm7=t[0]+t[3], xmm4=t[0]-t[3] \
    1-2 butterfly: xmm5=t[1], xmm6=t[2] -> xmm6=t[1]+t[2], xmm5=t[1]-t[2]*/ \
  "paddw %%xmm2,%%xmm1\n\t" \
  "paddw %%xmm5,%%xmm6\n\t" \
  "paddw %%xmm4,%%xmm7\n\t" \
  "paddw %%xmm2,%%xmm2\n\t" \
  "paddw %%xmm4,%%xmm4\n\t" \
  "paddw %%xmm5,%%xmm5\n\t" \
  "psubw %%xmm1,%%xmm2\n\t" \
  "psubw %%xmm7,%%xmm4\n\t" \
  "psubw %%xmm6,%%xmm5\n\t"

397 static void oc_idct8x8_10_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64]){ |
|
398 OC_ALIGN16(ogg_int16_t buf[16]); |
|
399 /*This routine accepts an 8x8 matrix pre-transposed.*/ |
|
400 __asm__ __volatile__( |
|
401 "movq "OC_MEM_OFFS(0x20,x)",%%mm2\n\t" |
|
402 "movq "OC_MEM_OFFS(0x30,x)",%%mm3\n\t" |
|
403 "movq "OC_MEM_OFFS(0x10,x)",%%mm1\n\t" |
|
404 "movq "OC_MEM_OFFS(0x00,x)",%%mm0\n\t" |
|
405 OC_IDCT_8x8_10_MMX |
|
406 OC_TRANSPOSE_8x4_MMX2SSE |
|
407 OC_IDCT_8x8_10_ABC |
|
408 OC_IDCT_8x8_D_STORE |
|
409 :[buf]"=m"(OC_ARRAY_OPERAND(short,buf,16)), |
|
410 [y]"=m"(OC_ARRAY_OPERAND(ogg_int16_t,_y,64)) |
|
411 :[x]"m"OC_CONST_ARRAY_OPERAND(ogg_int16_t,_x,64), |
|
412 [c]"m"(OC_CONST_ARRAY_OPERAND(ogg_int16_t,OC_IDCT_CONSTS,128)) |
|
413 ); |
|
414 if(_x!=_y){ |
|
415 /*Clear input data for next block (decoder only).*/ |
|
416 __asm__ __volatile__( |
|
417 "pxor %%mm0,%%mm0\n\t" |
|
418 "movq %%mm0,"OC_MEM_OFFS(0x00,x)"\n\t" |
|
419 "movq %%mm0,"OC_MEM_OFFS(0x10,x)"\n\t" |
|
420 "movq %%mm0,"OC_MEM_OFFS(0x20,x)"\n\t" |
|
421 "movq %%mm0,"OC_MEM_OFFS(0x30,x)"\n\t" |
|
422 :[x]"+m"(OC_ARRAY_OPERAND(ogg_int16_t,_x,28)) |
|
423 ); |
|
424 } |
|
425 } |
|
426 |
|
427 /*Performs an inverse 8x8 Type-II DCT transform. |
|
428 The input is assumed to be scaled by a factor of 4 relative to orthonormal |
|
429 version of the transform.*/ |
|
430 void oc_idct8x8_sse2(ogg_int16_t _y[64],ogg_int16_t _x[64],int _last_zzi){ |
|
431 /*_last_zzi is subtly different from an actual count of the number of |
|
432 coefficients we decoded for this block. |
|
433 It contains the value of zzi BEFORE the final token in the block was |
|
434 decoded. |
|
435 In most cases this is an EOB token (the continuation of an EOB run from a |
|
436 previous block counts), and so this is the same as the coefficient count. |
|
437 However, in the case that the last token was NOT an EOB token, but filled |
|
438 the block up with exactly 64 coefficients, _last_zzi will be less than 64. |
|
439 Provided the last token was not a pure zero run, the minimum value it can |
|
440 be is 46, and so that doesn't affect any of the cases in this routine. |
|
441 However, if the last token WAS a pure zero run of length 63, then _last_zzi |
|
442 will be 1 while the number of coefficients decoded is 64. |
|
443 Thus, we will trigger the following special case, where the real |
|
444 coefficient count would not. |
|
445 Note also that a zero run of length 64 will give _last_zzi a value of 0, |
|
446 but we still process the DC coefficient, which might have a non-zero value |
|
447 due to DC prediction. |
|
448 Although convoluted, this is arguably the correct behavior: it allows us to |
|
449 use a smaller transform when the block ends with a long zero run instead |
|
450 of a normal EOB token. |
|
451 It could be smarter... multiple separate zero runs at the end of a block |
|
452 will fool it, but an encoder that generates these really deserves what it |
|
453 gets. |
|
454 Needless to say we inherited this approach from VP3.*/ |
|
455 /*Then perform the iDCT.*/ |
|
456 if(_last_zzi<=10)oc_idct8x8_10_sse2(_y,_x); |
|
457 else oc_idct8x8_slow_sse2(_y,_x); |
|
458 } |
|
459 |
|
#endif