;
; jf3dnflt.asm - floating-point FDCT (3DNow!)
;
; Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
;
; Based on
; x86 SIMD extension for IJG JPEG library
; Copyright (C) 1999-2006, MIYASAKA Masaru.
; For conditions of distribution and use, see copyright notice in jsimdext.inc
;
; This file should be assembled with NASM (Netwide Assembler); it
; can *not* be assembled with Microsoft's MASM or any compatible
; assembler (including Borland's Turbo Assembler).
; NASM is available from http://nasm.sourceforge.net/ or
; http://sourceforge.net/project/showfiles.php?group_id=6208
;
; This file contains a floating-point implementation of the forward DCT
; (Discrete Cosine Transform). The following code is based directly on
; the IJG's original jfdctflt.c; see jfdctflt.c for more details.
;
; [TAB8]

; EXTN, SEG_CONST/SEG_TEXT, alignz/alignx, pushpic/poppic, get_GOT, GOTOFF,
; MMBLOCK, DCTSIZE, POINTER, MMWORD and the SIZEOF_* names used below are not
; defined in this file; they are expected to come from these two includes.
%include "jsimdext.inc"
%include "jdct.inc"

; --------------------------------------------------------------------------
; Constant table for jsimd_fdct_float_3dnow.
; Every multiplier is emitted twice ("times 2 dd") so that a single 64-bit
; memory operand supplies the same factor to both packed-float lanes of the
; pfmul instructions that reference it.
        SECTION SEG_CONST

        alignz  16
        global  EXTN(jconst_fdct_float_3dnow)

EXTN(jconst_fdct_float_3dnow):

PD_0_382        times 2 dd      0.382683432365089771728460
PD_0_707        times 2 dd      0.707106781186547524400844
PD_0_541        times 2 dd      0.541196100146196984399723
PD_1_306        times 2 dd      1.306562964876376527856643

        alignz  16                      ; pad the table out to a 16-byte boundary

; --------------------------------------------------------------------------
        SECTION SEG_TEXT
        BITS    32
;
; In-place forward DCT of one block of samples (floating point, 3DNow!).
;
; GLOBAL(void)
; jsimd_fdct_float_3dnow (FAST_FLOAT * data)
;
; In:     data is the only argument and is read from the stack
;         ([esp+4] on entry).
; Out:    the block at data is overwritten with its DCT coefficients.
; Regs:   eax, ecx, edx, mm0-mm7 and the flags are modified and not restored.
;         ebp and esp are restored.  ebx is bracketed by pushpic/poppic
;         (macros from the includes; presumably they save/restore it in PIC
;         builds - confirm in jsimdext.inc).
;         The MMX/3DNow! state is released with femms before returning.
; Stack:  an mmword-aligned frame holding WK_NUM mmword scratch slots.
;
; Data flow: each pfadd/pfsub/pfmul works on two floats at once.
;   Pass 1 transforms two rows per iteration.  Its results are stored
;   without being transposed back, so every 2x2 tile of the block is left
;   transposed.
;   Pass 2 transforms two columns per iteration.  Its punpck sequence undoes
;   that 2x2 transposition while loading, and it stores in natural order.
; tmpN / zN names follow jfdctflt.c; inN / outN are the N-th input / output
; of the 8-point transform currently being computed.
;

%define data(b)         (b)+8           ; FAST_FLOAT * data, b -> saved ebp

%define original_ebp    ebp+0           ; slot at the aligned frame base
%define WK_NUM          2               ; number of mmword scratch slots
%define wk(i)           ebp-(WK_NUM-(i))*SIZEOF_MMWORD  ; mmword wk[WK_NUM]

        align   16
        global  EXTN(jsimd_fdct_float_3dnow)

EXTN(jsimd_fdct_float_3dnow):
        push    ebp
        mov     eax,esp                 ; eax -> saved ebp; argument is at [eax+8]
        sub     esp, byte 4             ; make room for the slot written below
        and     esp, byte (-SIZEOF_MMWORD)      ; round esp down to an mmword boundary
        mov     [esp],eax               ; stash the unaligned esp for the epilogue
        mov     ebp,esp                 ; ebp = aligned frame base
        lea     esp, [wk(0)]            ; reserve wk[0 .. WK_NUM-1] below ebp
        pushpic ebx
        ; ecx and edx are not preserved; esi and edi are not used.

        get_GOT ebx                     ; ebx = base register for GOTOFF() operands

        ; ---- Pass 1: rows, two at a time.

        mov     edx, POINTER [data(eax)]        ; edx = data
        mov     ecx, DCTSIZE/2                  ; iteration count (two rows each)
        alignx  16,7
.row_pair:

        movq    mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)]

        ; Loaded: elements 0,1 and 6,7 of the first row (mm0, mm2) and of the
        ; second row (mm1, mm3).  Regroup so each register holds one element
        ; index for both rows (low lane = first row, high lane = second row).

        movq      mm4,mm0
        punpckldq mm0,mm1               ; mm0 = in0
        punpckhdq mm4,mm1               ; mm4 = in1
        movq      mm5,mm2
        punpckldq mm2,mm3               ; mm2 = in6
        punpckhdq mm5,mm3               ; mm5 = in7

        movq    mm6,mm4
        movq    mm7,mm0
        pfsub   mm4,mm2                 ; mm4 = tmp6 = in1 - in6
        pfsub   mm0,mm5                 ; mm0 = tmp7 = in0 - in7
        pfadd   mm6,mm2                 ; mm6 = tmp1 = in1 + in6
        pfadd   mm7,mm5                 ; mm7 = tmp0 = in0 + in7

        movq    mm1, MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)]
        movq    mm5, MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)]

        ; Loaded: elements 2,3 and 4,5 of both rows.  tmp6/tmp7 are spilled
        ; because all eight MMX registers are needed for the even part.

        movq    MMWORD [wk(0)], mm4     ; wk(0) = tmp6
        movq    MMWORD [wk(1)], mm0     ; wk(1) = tmp7

        movq      mm4,mm1
        punpckldq mm1,mm3               ; mm1 = in2
        punpckhdq mm4,mm3               ; mm4 = in3
        movq      mm0,mm2
        punpckldq mm2,mm5               ; mm2 = in4
        punpckhdq mm0,mm5               ; mm0 = in5

        movq    mm3,mm4
        movq    mm5,mm1
        pfadd   mm4,mm2                 ; mm4 = tmp3 = in3 + in4
        pfadd   mm1,mm0                 ; mm1 = tmp2 = in2 + in5
        pfsub   mm3,mm2                 ; mm3 = tmp4 = in3 - in4
        pfsub   mm5,mm0                 ; mm5 = tmp5 = in2 - in5

        ; -- Even part (out0, out2, out4, out6)

        movq    mm2,mm7
        movq    mm0,mm6
        pfsub   mm7,mm4                 ; mm7 = tmp13 = tmp0 - tmp3
        pfsub   mm6,mm1                 ; mm6 = tmp12 = tmp1 - tmp2
        pfadd   mm2,mm4                 ; mm2 = tmp10 = tmp0 + tmp3
        pfadd   mm0,mm1                 ; mm0 = tmp11 = tmp1 + tmp2

        pfadd   mm6,mm7
        pfmul   mm6,[GOTOFF(ebx,PD_0_707)]      ; mm6 = z1 = (tmp12 + tmp13) * 0.707...

        movq    mm4,mm2
        movq    mm1,mm7
        pfsub   mm2,mm0                 ; mm2 = out4 = tmp10 - tmp11
        pfsub   mm7,mm6                 ; mm7 = out6 = tmp13 - z1
        pfadd   mm4,mm0                 ; mm4 = out0 = tmp10 + tmp11
        pfadd   mm1,mm6                 ; mm1 = out2 = tmp13 + z1

        ; Even outputs fill the first row of the pair.
        movq    MMWORD [MMBLOCK(0,2,edx,SIZEOF_FAST_FLOAT)], mm2
        movq    MMWORD [MMBLOCK(0,3,edx,SIZEOF_FAST_FLOAT)], mm7
        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(0,1,edx,SIZEOF_FAST_FLOAT)], mm1

        ; -- Odd part (out1, out3, out5, out7)

        movq    mm0, MMWORD [wk(0)]     ; mm0 = tmp6 (reload)
        movq    mm6, MMWORD [wk(1)]     ; mm6 = tmp7 (reload)

        pfadd   mm3,mm5                 ; mm3 = tmp10 = tmp4 + tmp5
        pfadd   mm5,mm0                 ; mm5 = tmp11 = tmp5 + tmp6
        pfadd   mm0,mm6                 ; mm0 = tmp12 = tmp6 + tmp7 (mm6 still tmp7)

        pfmul   mm5,[GOTOFF(ebx,PD_0_707)]      ; mm5 = z3 = tmp11 * 0.707...

        movq    mm2,mm3                 ; keep a copy of tmp10
        pfsub   mm3,mm0
        pfmul   mm3,[GOTOFF(ebx,PD_0_382)]      ; mm3 = z5 = (tmp10 - tmp12) * 0.382...
        pfmul   mm2,[GOTOFF(ebx,PD_0_541)]      ; mm2 = tmp10 * 0.541...
        pfmul   mm0,[GOTOFF(ebx,PD_1_306)]      ; mm0 = tmp12 * 1.306...
        pfadd   mm2,mm3                 ; mm2 = z2
        pfadd   mm0,mm3                 ; mm0 = z4

        movq    mm7,mm6
        pfsub   mm6,mm5                 ; mm6 = z13 = tmp7 - z3
        pfadd   mm7,mm5                 ; mm7 = z11 = tmp7 + z3

        movq    mm4,mm6
        movq    mm1,mm7
        pfsub   mm6,mm2                 ; mm6 = out3 = z13 - z2
        pfsub   mm7,mm0                 ; mm7 = out7 = z11 - z4
        pfadd   mm4,mm2                 ; mm4 = out5 = z13 + z2
        pfadd   mm1,mm0                 ; mm1 = out1 = z11 + z4

        ; Odd outputs fill the second row of the pair.
        movq    MMWORD [MMBLOCK(1,1,edx,SIZEOF_FAST_FLOAT)], mm6
        movq    MMWORD [MMBLOCK(1,3,edx,SIZEOF_FAST_FLOAT)], mm7
        movq    MMWORD [MMBLOCK(1,2,edx,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1

        add     edx, byte 2*DCTSIZE*SIZEOF_FAST_FLOAT   ; step to the next two rows
        dec     ecx
        jnz     near .row_pair

        ; ---- Pass 2: columns, two at a time.

        mov     edx, POINTER [data(eax)]        ; edx = data again (eax is untouched)
        mov     ecx, DCTSIZE/2                  ; iteration count (two columns each)
        alignx  16,7
.column_pair:

        movq    mm0, MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm1, MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)]

        ; Loaded from rows 0,1 and 6,7 at the current column pair.  Pass 1
        ; left each 2x2 tile transposed; the unpacks below put it back, so
        ; each register holds one row index for both columns
        ; (low lane = first column, high lane = second column).

        movq      mm4,mm0
        punpckldq mm0,mm1               ; mm0 = in0
        punpckhdq mm4,mm1               ; mm4 = in1
        movq      mm5,mm2
        punpckldq mm2,mm3               ; mm2 = in6
        punpckhdq mm5,mm3               ; mm5 = in7

        movq    mm6,mm4
        movq    mm7,mm0
        pfsub   mm4,mm2                 ; mm4 = tmp6 = in1 - in6
        pfsub   mm0,mm5                 ; mm0 = tmp7 = in0 - in7
        pfadd   mm6,mm2                 ; mm6 = tmp1 = in1 + in6
        pfadd   mm7,mm5                 ; mm7 = tmp0 = in0 + in7

        movq    mm1, MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm3, MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm2, MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)]
        movq    mm5, MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)]

        ; Loaded from rows 2,3 and 4,5; spill tmp6/tmp7 as in pass 1.

        movq    MMWORD [wk(0)], mm4     ; wk(0) = tmp6
        movq    MMWORD [wk(1)], mm0     ; wk(1) = tmp7

        movq      mm4,mm1
        punpckldq mm1,mm3               ; mm1 = in2
        punpckhdq mm4,mm3               ; mm4 = in3
        movq      mm0,mm2
        punpckldq mm2,mm5               ; mm2 = in4
        punpckhdq mm0,mm5               ; mm0 = in5

        movq    mm3,mm4
        movq    mm5,mm1
        pfadd   mm4,mm2                 ; mm4 = tmp3 = in3 + in4
        pfadd   mm1,mm0                 ; mm1 = tmp2 = in2 + in5
        pfsub   mm3,mm2                 ; mm3 = tmp4 = in3 - in4
        pfsub   mm5,mm0                 ; mm5 = tmp5 = in2 - in5

        ; -- Even part (out0, out2, out4, out6)

        movq    mm2,mm7
        movq    mm0,mm6
        pfsub   mm7,mm4                 ; mm7 = tmp13 = tmp0 - tmp3
        pfsub   mm6,mm1                 ; mm6 = tmp12 = tmp1 - tmp2
        pfadd   mm2,mm4                 ; mm2 = tmp10 = tmp0 + tmp3
        pfadd   mm0,mm1                 ; mm0 = tmp11 = tmp1 + tmp2

        pfadd   mm6,mm7
        pfmul   mm6,[GOTOFF(ebx,PD_0_707)]      ; mm6 = z1 = (tmp12 + tmp13) * 0.707...

        movq    mm4,mm2
        movq    mm1,mm7
        pfsub   mm2,mm0                 ; mm2 = out4 = tmp10 - tmp11
        pfsub   mm7,mm6                 ; mm7 = out6 = tmp13 - z1
        pfadd   mm4,mm0                 ; mm4 = out0 = tmp10 + tmp11
        pfadd   mm1,mm6                 ; mm1 = out2 = tmp13 + z1

        ; Each output goes to the row with its own index.
        movq    MMWORD [MMBLOCK(4,0,edx,SIZEOF_FAST_FLOAT)], mm2
        movq    MMWORD [MMBLOCK(6,0,edx,SIZEOF_FAST_FLOAT)], mm7
        movq    MMWORD [MMBLOCK(0,0,edx,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(2,0,edx,SIZEOF_FAST_FLOAT)], mm1

        ; -- Odd part (out1, out3, out5, out7)

        movq    mm0, MMWORD [wk(0)]     ; mm0 = tmp6 (reload)
        movq    mm6, MMWORD [wk(1)]     ; mm6 = tmp7 (reload)

        pfadd   mm3,mm5                 ; mm3 = tmp10 = tmp4 + tmp5
        pfadd   mm5,mm0                 ; mm5 = tmp11 = tmp5 + tmp6
        pfadd   mm0,mm6                 ; mm0 = tmp12 = tmp6 + tmp7 (mm6 still tmp7)

        pfmul   mm5,[GOTOFF(ebx,PD_0_707)]      ; mm5 = z3 = tmp11 * 0.707...

        movq    mm2,mm3                 ; keep a copy of tmp10
        pfsub   mm3,mm0
        pfmul   mm3,[GOTOFF(ebx,PD_0_382)]      ; mm3 = z5 = (tmp10 - tmp12) * 0.382...
        pfmul   mm2,[GOTOFF(ebx,PD_0_541)]      ; mm2 = tmp10 * 0.541...
        pfmul   mm0,[GOTOFF(ebx,PD_1_306)]      ; mm0 = tmp12 * 1.306...
        pfadd   mm2,mm3                 ; mm2 = z2
        pfadd   mm0,mm3                 ; mm0 = z4

        movq    mm7,mm6
        pfsub   mm6,mm5                 ; mm6 = z13 = tmp7 - z3
        pfadd   mm7,mm5                 ; mm7 = z11 = tmp7 + z3

        movq    mm4,mm6
        movq    mm1,mm7
        pfsub   mm6,mm2                 ; mm6 = out3 = z13 - z2
        pfsub   mm7,mm0                 ; mm7 = out7 = z11 - z4
        pfadd   mm4,mm2                 ; mm4 = out5 = z13 + z2
        pfadd   mm1,mm0                 ; mm1 = out1 = z11 + z4

        movq    MMWORD [MMBLOCK(3,0,edx,SIZEOF_FAST_FLOAT)], mm6
        movq    MMWORD [MMBLOCK(7,0,edx,SIZEOF_FAST_FLOAT)], mm7
        movq    MMWORD [MMBLOCK(5,0,edx,SIZEOF_FAST_FLOAT)], mm4
        movq    MMWORD [MMBLOCK(1,0,edx,SIZEOF_FAST_FLOAT)], mm1

        add     edx, byte 2*SIZEOF_FAST_FLOAT   ; step to the next two columns
        dec     ecx
        jnz     near .column_pair

        femms                           ; release the MMX/3DNow! register state

        ; Nothing to restore for ecx/edx (not preserved) or esi/edi (unused).
        poppic  ebx
        mov     esp,ebp                 ; discard wk[]; esp -> slot written in the prologue
        pop     esp                     ; esp = pre-alignment value, pointing at saved ebp
        pop     ebp
        ret

; Trailing alignment: per the original note, the OS X linker does not honor
; the request to align the segment unless this is present.
        align   16