/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
    last mod: $Id: sse2trans.h 15675 2009-02-06 09:43:27Z tterribe $

 ********************************************************************/
|
17 |
|
18 #if !defined(_x86_sse2trans_H) |
|
19 # define _x86_sse2trans_H (1) |
|
20 # include "x86int.h" |
|
21 |
|
22 # if defined(OC_X86_64_ASM) |
|
/*Transposes an 8x8 block of 16-bit values entirely in registers (the variant
   that has xmm8 available).
  On x86-64 we can transpose in-place without spilling registers.
  By clever choices of the order to apply the butterflies and the order of
   their outputs, we can take the rows in order and output the columns in order
   without any extra operations and using just one temporary register.

  Input:    xmm0...xmm7 hold rows a...h, eight 16-bit words each.
  Output:   xmm0...xmm7 hold columns 0...7, i.e. xmmN = hN gN fN eN dN cN bN aN.
  Scratch:  xmm8 is overwritten; no memory is read or written.

  In the lane diagrams below, words are listed from the most significant
   (left) to the least significant (right).
  Register names are spelled with a doubled '%', so this string is meant to be
   pasted into the template of an extended asm statement; that statement is
   responsible for accounting for the registers overwritten here.
  The definition deliberately does not end in a line continuation, so it does
   not depend on what follows it in the file.*/
# define OC_TRANSPOSE_8x8 \
 "#OC_TRANSPOSE_8x8\n\t" \
 /*Stage 1: interleave the 16-bit words of each pair of adjacent rows.*/ \
 "movdqa %%xmm4,%%xmm8\n\t" \
 /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
 "punpcklwd %%xmm5,%%xmm4\n\t" \
 /*xmm8 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
 "punpckhwd %%xmm5,%%xmm8\n\t" \
 /*xmm5 is free.*/ \
 "movdqa %%xmm0,%%xmm5\n\t" \
 /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
 "punpcklwd %%xmm1,%%xmm0\n\t" \
 /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
 "punpckhwd %%xmm1,%%xmm5\n\t" \
 /*xmm1 is free.*/ \
 "movdqa %%xmm6,%%xmm1\n\t" \
 /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
 "punpcklwd %%xmm7,%%xmm6\n\t" \
 /*xmm1 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
 "punpckhwd %%xmm7,%%xmm1\n\t" \
 /*xmm7 is free.*/ \
 "movdqa %%xmm2,%%xmm7\n\t" \
 /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
 "punpckhwd %%xmm3,%%xmm2\n\t" \
 /*xmm7 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
 "punpcklwd %%xmm3,%%xmm7\n\t" \
 /*xmm3 is free.*/ \
 /*Stage 2: interleave 32-bit pairs to gather four rows per column.*/ \
 "movdqa %%xmm0,%%xmm3\n\t" \
 /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
 "punpckldq %%xmm7,%%xmm0\n\t" \
 /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
 "punpckhdq %%xmm7,%%xmm3\n\t" \
 /*xmm7 is free.*/ \
 "movdqa %%xmm5,%%xmm7\n\t" \
 /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
 "punpckldq %%xmm2,%%xmm5\n\t" \
 /*xmm7 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
 "punpckhdq %%xmm2,%%xmm7\n\t" \
 /*xmm2 is free.*/ \
 "movdqa %%xmm4,%%xmm2\n\t" \
 /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
 "punpckhdq %%xmm6,%%xmm4\n\t" \
 /*xmm2 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
 "punpckldq %%xmm6,%%xmm2\n\t" \
 /*xmm6 is free.*/ \
 "movdqa %%xmm8,%%xmm6\n\t" \
 /*xmm6 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
 "punpckldq %%xmm1,%%xmm6\n\t" \
 /*xmm8 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
 "punpckhdq %%xmm1,%%xmm8\n\t" \
 /*xmm1 is free.*/ \
 /*Stage 3: interleave 64-bit halves to complete each column.*/ \
 "movdqa %%xmm0,%%xmm1\n\t" \
 /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
 "punpcklqdq %%xmm2,%%xmm0\n\t" \
 /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
 "punpckhqdq %%xmm2,%%xmm1\n\t" \
 /*xmm2 is free.*/ \
 "movdqa %%xmm3,%%xmm2\n\t" \
 /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
 "punpckhqdq %%xmm4,%%xmm3\n\t" \
 /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
 "punpcklqdq %%xmm4,%%xmm2\n\t" \
 /*xmm4 is free.*/ \
 "movdqa %%xmm5,%%xmm4\n\t" \
 /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
 "punpckhqdq %%xmm6,%%xmm5\n\t" \
 /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
 "punpcklqdq %%xmm6,%%xmm4\n\t" \
 /*xmm6 is free.*/ \
 "movdqa %%xmm7,%%xmm6\n\t" \
 /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
 "punpckhqdq %%xmm8,%%xmm7\n\t" \
 /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
 "punpcklqdq %%xmm8,%%xmm6\n\t" \
 /*xmm8 is free.*/
|
101 |
|
102 # else |
|
/*Transposes an 8x8 block of 16-bit values using only xmm0...xmm7 plus two
   16-byte spill slots (the variant used when xmm8 is not available).
  Otherwise, we need to spill some values to %[buf] temporarily.
  Again, the butterflies are carefully arranged to get the columns to come out
   in order, minimizing register spills and maximizing the delay between a load
   and when the value loaded is actually used.

  Input:    xmm0...xmm7 hold rows a...h, eight 16-bit words each.
  Output:   xmm0...xmm7 hold columns 0...7, i.e. xmmN = hN gN fN eN dN cN bN aN.
  Scratch:  the locations named by OC_MEM_OFFS(0x00,buf) and
             OC_MEM_OFFS(0x10,buf) are written and read back with movdqa, so
             each must be a 16-byte aligned, writable 16-byte slot.
            Their contents on exit are leftovers and carry no meaning.
  NOTE(review): OC_MEM_OFFS is defined outside this file; presumably it forms a
   byte-offset memory reference from the asm operand named "buf" -- confirm
   against its definition.

  In the lane diagrams below, words are listed from the most significant
   (left) to the least significant (right); "buf[0]" and "buf[1]" denote the
   slots at offsets 0x00 and 0x10.
  Register names are spelled with a doubled '%', so this string is meant to be
   pasted into the template of an extended asm statement; that statement is
   responsible for accounting for the registers and memory overwritten here.
  Whitespace separates each string literal from OC_MEM_OFFS so the text is not
   lexed as a user-defined-literal suffix by a C++11 compiler, and the
   definition deliberately does not end in a line continuation.*/
# define OC_TRANSPOSE_8x8 \
 "#OC_TRANSPOSE_8x8\n\t" \
 /*Stage 1: interleave the 16-bit words of each pair of adjacent rows.*/ \
 /*buf[0] = a7 a6 a5 a4 a3 a2 a1 a0*/ \
 "movdqa %%xmm0," OC_MEM_OFFS(0x00,buf) "\n\t" \
 /*xmm0 is free.*/ \
 "movdqa %%xmm2,%%xmm0\n\t" \
 /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
 "punpckhwd %%xmm3,%%xmm2\n\t" \
 /*xmm0 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
 "punpcklwd %%xmm3,%%xmm0\n\t" \
 /*xmm3 = a7 a6 a5 a4 a3 a2 a1 a0*/ \
 "movdqa " OC_MEM_OFFS(0x00,buf) ",%%xmm3\n\t" \
 /*buf[1] = d7 c7 d6 c6 d5 c5 d4 c4*/ \
 "movdqa %%xmm2," OC_MEM_OFFS(0x10,buf) "\n\t" \
 /*xmm2 is free.*/ \
 "movdqa %%xmm6,%%xmm2\n\t" \
 /*xmm6 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
 "punpcklwd %%xmm7,%%xmm6\n\t" \
 /*xmm2 = h7 g7 h6 g6 h5 g5 h4 g4*/ \
 "punpckhwd %%xmm7,%%xmm2\n\t" \
 /*xmm7 is free.*/ \
 "movdqa %%xmm4,%%xmm7\n\t" \
 /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
 "punpcklwd %%xmm5,%%xmm4\n\t" \
 /*xmm7 = f7 e7 f6 e6 f5 e5 f4 e4*/ \
 "punpckhwd %%xmm5,%%xmm7\n\t" \
 /*xmm5 is free.*/ \
 "movdqa %%xmm3,%%xmm5\n\t" \
 /*xmm3 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
 "punpcklwd %%xmm1,%%xmm3\n\t" \
 /*xmm5 = b7 a7 b6 a6 b5 a5 b4 a4*/ \
 "punpckhwd %%xmm1,%%xmm5\n\t" \
 /*xmm1 is free.*/ \
 /*Stage 2: interleave 32-bit pairs to gather four rows per column.*/ \
 "movdqa %%xmm7,%%xmm1\n\t" \
 /*xmm7 = h5 g5 f5 e5 h4 g4 f4 e4*/ \
 "punpckldq %%xmm2,%%xmm7\n\t" \
 /*xmm1 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
 "punpckhdq %%xmm2,%%xmm1\n\t" \
 /*xmm2 = d7 c7 d6 c6 d5 c5 d4 c4*/ \
 "movdqa " OC_MEM_OFFS(0x10,buf) ",%%xmm2\n\t" \
 /*buf[0] = h7 g7 f7 e7 h6 g6 f6 e6*/ \
 "movdqa %%xmm1," OC_MEM_OFFS(0x00,buf) "\n\t" \
 /*xmm1 is free.*/ \
 "movdqa %%xmm3,%%xmm1\n\t" \
 /*xmm3 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
 "punpckhdq %%xmm0,%%xmm3\n\t" \
 /*xmm1 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
 "punpckldq %%xmm0,%%xmm1\n\t" \
 /*xmm0 is free.*/ \
 "movdqa %%xmm4,%%xmm0\n\t" \
 /*xmm4 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
 "punpckhdq %%xmm6,%%xmm4\n\t" \
 /*xmm0 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
 "punpckldq %%xmm6,%%xmm0\n\t" \
 /*xmm6 is free.*/ \
 "movdqa %%xmm5,%%xmm6\n\t" \
 /*xmm5 = d5 c5 b5 a5 d4 c4 b4 a4*/ \
 "punpckldq %%xmm2,%%xmm5\n\t" \
 /*xmm6 = d7 c7 b7 a7 d6 c6 b6 a6*/ \
 "punpckhdq %%xmm2,%%xmm6\n\t" \
 /*xmm2 is free.*/ \
 /*Stage 3: interleave 64-bit halves to complete each column.*/ \
 "movdqa %%xmm1,%%xmm2\n\t" \
 /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
 "punpckhqdq %%xmm0,%%xmm1\n\t" \
 /*xmm2 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
 "punpcklqdq %%xmm0,%%xmm2\n\t" \
 /*xmm0 = h7 g7 f7 e7 h6 g6 f6 e6*/ \
 "movdqa " OC_MEM_OFFS(0x00,buf) ",%%xmm0\n\t" \
 /*buf[1] = h0 g0 f0 e0 d0 c0 b0 a0*/ \
 "movdqa %%xmm2," OC_MEM_OFFS(0x10,buf) "\n\t" \
 /*xmm2 is free.*/ \
 "movdqa %%xmm3,%%xmm2\n\t" \
 /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
 "punpckhqdq %%xmm4,%%xmm3\n\t" \
 /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
 "punpcklqdq %%xmm4,%%xmm2\n\t" \
 /*xmm4 is free.*/ \
 "movdqa %%xmm5,%%xmm4\n\t" \
 /*xmm5 = h5 g5 f5 e5 d5 c5 b5 a5*/ \
 "punpckhqdq %%xmm7,%%xmm5\n\t" \
 /*xmm4 = h4 g4 f4 e4 d4 c4 b4 a4*/ \
 "punpcklqdq %%xmm7,%%xmm4\n\t" \
 /*xmm7 is free.*/ \
 "movdqa %%xmm6,%%xmm7\n\t" \
 /*xmm6 = h6 g6 f6 e6 d6 c6 b6 a6*/ \
 "punpcklqdq %%xmm0,%%xmm6\n\t" \
 /*xmm7 = h7 g7 f7 e7 d7 c7 b7 a7*/ \
 "punpckhqdq %%xmm0,%%xmm7\n\t" \
 /*Column 0 was parked in buf[1] while xmm0 served as a temporary.*/ \
 /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
 "movdqa " OC_MEM_OFFS(0x10,buf) ",%%xmm0\n\t"
|
197 |
|
198 # endif |
|
199 |
|
/*Transpose 4 values in each of 8 MMX registers into 8 values in the first
   four SSE registers.
  No need to be clever here; we have plenty of room.

  Input:    mm0...mm7 hold rows a...h, four 16-bit words each.
            The MMX registers are only read.
  Output:   xmm0...xmm3 hold columns 0...3, i.e. xmmN = hN gN fN eN dN cN bN aN.
  Scratch:  xmm4...xmm7 are overwritten with intermediate values; no memory is
             read or written.

  In the lane diagrams below, words are listed from the most significant
   (left) to the least significant (right).
  Register names are spelled with a doubled '%', so this string is meant to be
   pasted into the template of an extended asm statement; that statement is
   responsible for accounting for the registers overwritten here.
  The definition deliberately does not end in a line continuation, so it does
   not depend on what follows it in the file.*/
# define OC_TRANSPOSE_8x4_MMX2SSE \
 "#OC_TRANSPOSE_8x4_MMX2SSE\n\t" \
 /*Stage 1: copy each row into the low half of an SSE register and interleave \
    the 16-bit words of adjacent row pairs. \
   Rows c/d and g/h are loaded into swapped register numbers so that the \
    interleaved results land in xmm3 and xmm7.*/ \
 "movq2dq %%mm0,%%xmm0\n\t" \
 "movq2dq %%mm1,%%xmm1\n\t" \
 /*xmm0 = b3 a3 b2 a2 b1 a1 b0 a0*/ \
 "punpcklwd %%xmm1,%%xmm0\n\t" \
 "movq2dq %%mm2,%%xmm3\n\t" \
 "movq2dq %%mm3,%%xmm2\n\t" \
 /*xmm3 = d3 c3 d2 c2 d1 c1 d0 c0*/ \
 "punpcklwd %%xmm2,%%xmm3\n\t" \
 "movq2dq %%mm4,%%xmm4\n\t" \
 "movq2dq %%mm5,%%xmm5\n\t" \
 /*xmm4 = f3 e3 f2 e2 f1 e1 f0 e0*/ \
 "punpcklwd %%xmm5,%%xmm4\n\t" \
 "movq2dq %%mm6,%%xmm7\n\t" \
 "movq2dq %%mm7,%%xmm6\n\t" \
 /*xmm7 = h3 g3 h2 g2 h1 g1 h0 g0*/ \
 "punpcklwd %%xmm6,%%xmm7\n\t" \
 /*Stage 2: interleave 32-bit pairs to gather four rows per column.*/ \
 "movdqa %%xmm0,%%xmm2\n\t" \
 /*xmm0 = d1 c1 b1 a1 d0 c0 b0 a0*/ \
 "punpckldq %%xmm3,%%xmm0\n\t" \
 /*xmm2 = d3 c3 b3 a3 d2 c2 b2 a2*/ \
 "punpckhdq %%xmm3,%%xmm2\n\t" \
 "movdqa %%xmm4,%%xmm5\n\t" \
 /*xmm4 = h1 g1 f1 e1 h0 g0 f0 e0*/ \
 "punpckldq %%xmm7,%%xmm4\n\t" \
 /*xmm5 = h3 g3 f3 e3 h2 g2 f2 e2*/ \
 "punpckhdq %%xmm7,%%xmm5\n\t" \
 /*Stage 3: interleave 64-bit halves to complete each column.*/ \
 "movdqa %%xmm0,%%xmm1\n\t" \
 /*xmm0 = h0 g0 f0 e0 d0 c0 b0 a0*/ \
 "punpcklqdq %%xmm4,%%xmm0\n\t" \
 /*xmm1 = h1 g1 f1 e1 d1 c1 b1 a1*/ \
 "punpckhqdq %%xmm4,%%xmm1\n\t" \
 "movdqa %%xmm2,%%xmm3\n\t" \
 /*xmm2 = h2 g2 f2 e2 d2 c2 b2 a2*/ \
 "punpcklqdq %%xmm5,%%xmm2\n\t" \
 /*xmm3 = h3 g3 f3 e3 d3 c3 b3 a3*/ \
 "punpckhqdq %%xmm5,%%xmm3\n\t"
|
241 |
|
242 #endif |