|
/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
    last mod: $Id: mmxfrag.c 17446 2010-09-23 20:06:20Z tterribe $

 ********************************************************************/

/*MMX acceleration of fragment reconstruction for motion compensation.
  Originally written by Rudolf Marek.
  Additional optimization by Nils Pipenbrinck.
  Note: Loops are unrolled for best performance.
  The iteration each instruction belongs to is marked in the comments as #i.*/
|
23 #include <stddef.h> |
|
24 #include "x86int.h" |
|
25 |
|
26 #if defined(OC_X86_ASM) |
|
27 |
|
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  Before expanding this macro the caller must #define SRC, DST, YSTRIDE, and
   YSTRIDE3 to four distinct x86 register names; YSTRIDE3 receives 3*_ystride
   and SRC/DST are advanced past the first four rows.
  Clobbers mm0-mm3 (no emms here; call oc_restore_fpu_mmx() when done with
   MMX code).*/
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
  do{ \
    const unsigned char *src; \
    unsigned char *dst; \
    src=(_src); \
    dst=(_dst); \
    __asm mov SRC,src \
    __asm mov DST,dst \
    __asm mov YSTRIDE,_ystride \
    /*src+0*ystride*/ \
    __asm movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm movq mm1,[SRC+YSTRIDE] \
    /*ystride3=ystride*3*/ \
    __asm lea YSTRIDE3,[YSTRIDE+YSTRIDE*2] \
    /*src+2*ystride*/ \
    __asm movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm movq [DST+YSTRIDE],mm1 \
    /*Pointer to next 4 source rows.*/ \
    __asm lea SRC,[SRC+YSTRIDE*4] \
    /*dst+2*ystride*/ \
    __asm movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm movq [DST+YSTRIDE3],mm3 \
    /*Pointer to next 4 destination rows.*/ \
    __asm lea DST,[DST+YSTRIDE*4] \
    /*Second group of four rows (loop unrolled; see file header note).*/ \
    /*src+0*ystride*/ \
    __asm movq mm0,[SRC] \
    /*src+1*ystride*/ \
    __asm movq mm1,[SRC+YSTRIDE] \
    /*src+2*ystride*/ \
    __asm movq mm2,[SRC+YSTRIDE*2] \
    /*src+3*ystride*/ \
    __asm movq mm3,[SRC+YSTRIDE3] \
    /*dst+0*ystride*/ \
    __asm movq [DST],mm0 \
    /*dst+1*ystride*/ \
    __asm movq [DST+YSTRIDE],mm1 \
    /*dst+2*ystride*/ \
    __asm movq [DST+YSTRIDE*2],mm2 \
    /*dst+3*ystride*/ \
    __asm movq [DST+YSTRIDE3],mm3 \
  } \
  while(0)
|
79 |
|
/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.
  _dst:     Destination block (8 rows of 8 bytes, _ystride bytes apart).
  _src:     Source block with the same layout.
  _ystride: Bytes between rows of both blocks.*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
/*Register bindings for OC_FRAG_COPY_MMX.
  NOTE(review): esi is used for YSTRIDE3 here, while the list variant below
   uses edi — presumably chosen per-function to avoid clashing with the
   compiler's own register use; do not change without checking codegen.*/
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 esi
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
}
|
94 |
|
/*Copies the fragments specified by the lists of fragment indices from one
   frame to another.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride of the reference frames.
  _fragis:        A pointer to a list of fragment indices.
  _nfragis:       The number of fragment indices to copy.
  _frag_buf_offs: The offsets of fragments in the reference frames.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  ptrdiff_t fi;
  /*Each listed fragment lives at the same offset in both frames; copy its
     8x8 block with the MMX macro.*/
  for(fi=0;fi<_nfragis;fi++){
    ptrdiff_t off=_frag_buf_offs[_fragis[fi]];
#define SRC edx
#define DST eax
#define YSTRIDE ecx
#define YSTRIDE3 edi
    OC_FRAG_COPY_MMX(_dst_frame+off,_src_frame+off,_ystride);
#undef SRC
#undef DST
#undef YSTRIDE
#undef YSTRIDE3
  }
}
|
122 |
|
/*Reconstructs an intra-coded 8x8 fragment: adds the bias 128 (0x0080 in each
   16-bit lane, built in mm0) to all 64 residue values with signed saturation,
   then packs to bytes with unsigned saturation and stores one 8-byte row at a
   time.
  _dst:     Destination for the reconstructed block.
  _ystride: Bytes between rows of _dst.
  _residue: 64 16-bit residue values in raster order (two movq loads per row).
  Clobbers mm0-mm6; no emms here (see oc_restore_fpu_mmx).*/
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  __asm{
#define DST edx
#define DST4 esi
#define YSTRIDE eax
#define YSTRIDE3 edi
#define RESIDUE ecx
    mov DST,_dst
    mov YSTRIDE,_ystride
    mov RESIDUE,_residue
    /*DST4 points at row 4 so rows 4-7 can be addressed without reloading.*/
    lea DST4,[DST+YSTRIDE*4]
    lea YSTRIDE3,[YSTRIDE+YSTRIDE*2]
    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
    pcmpeqw mm0,mm0
    /*#0 Load low residue.*/
    movq mm1,[0*8+RESIDUE]
    /*#0 Load high residue.*/
    movq mm2,[1*8+RESIDUE]
    /*Set mm0 to 0x8000800080008000.*/
    psllw mm0,15
    /*#1 Load low residue.*/
    movq mm3,[2*8+RESIDUE]
    /*#1 Load high residue.*/
    movq mm4,[3*8+RESIDUE]
    /*Set mm0 to 0x0080008000800080 (the +128 bias for each lane).*/
    psrlw mm0,8
    /*#2 Load low residue.*/
    movq mm5,[4*8+RESIDUE]
    /*#2 Load high residue.*/
    movq mm6,[5*8+RESIDUE]
    /*#0 Bias low residue.*/
    paddsw mm1,mm0
    /*#0 Bias high residue.*/
    paddsw mm2,mm0
    /*#0 Pack to byte.*/
    packuswb mm1,mm2
    /*#1 Bias low residue.*/
    paddsw mm3,mm0
    /*#1 Bias high residue.*/
    paddsw mm4,mm0
    /*#1 Pack to byte.*/
    packuswb mm3,mm4
    /*#2 Bias low residue.*/
    paddsw mm5,mm0
    /*#2 Bias high residue.*/
    paddsw mm6,mm0
    /*#2 Pack to byte.*/
    packuswb mm5,mm6
    /*#0 Write row.*/
    movq [DST],mm1
    /*#1 Write row.*/
    movq [DST+YSTRIDE],mm3
    /*#2 Write row.*/
    movq [DST+YSTRIDE*2],mm5
    /*#3 Load low residue.*/
    movq mm1,[6*8+RESIDUE]
    /*#3 Load high residue.*/
    movq mm2,[7*8+RESIDUE]
    /*#4 Load low residue.*/
    movq mm3,[8*8+RESIDUE]
    /*#4 Load high residue.*/
    movq mm4,[9*8+RESIDUE]
    /*#5 Load low residue.*/
    movq mm5,[10*8+RESIDUE]
    /*#5 Load high residue.*/
    movq mm6,[11*8+RESIDUE]
    /*#3 Bias low residue.*/
    paddsw mm1,mm0
    /*#3 Bias high residue.*/
    paddsw mm2,mm0
    /*#3 Pack to byte.*/
    packuswb mm1,mm2
    /*#4 Bias low residue.*/
    paddsw mm3,mm0
    /*#4 Bias high residue.*/
    paddsw mm4,mm0
    /*#4 Pack to byte.*/
    packuswb mm3,mm4
    /*#5 Bias low residue.*/
    paddsw mm5,mm0
    /*#5 Bias high residue.*/
    paddsw mm6,mm0
    /*#5 Pack to byte.*/
    packuswb mm5,mm6
    /*#3 Write row.*/
    movq [DST+YSTRIDE3],mm1
    /*#4 Write row.*/
    movq [DST4],mm3
    /*#5 Write row.*/
    movq [DST4+YSTRIDE],mm5
    /*#6 Load low residue.*/
    movq mm1,[12*8+RESIDUE]
    /*#6 Load high residue.*/
    movq mm2,[13*8+RESIDUE]
    /*#7 Load low residue.*/
    movq mm3,[14*8+RESIDUE]
    /*#7 Load high residue.*/
    movq mm4,[15*8+RESIDUE]
    /*#6 Bias low residue.*/
    paddsw mm1,mm0
    /*#6 Bias high residue.*/
    paddsw mm2,mm0
    /*#6 Pack to byte.*/
    packuswb mm1,mm2
    /*#7 Bias low residue.*/
    paddsw mm3,mm0
    /*#7 Bias high residue.*/
    paddsw mm4,mm0
    /*#7 Pack to byte.*/
    packuswb mm3,mm4
    /*#6 Write row.*/
    movq [DST4+YSTRIDE*2],mm1
    /*#7 Write row.*/
    movq [DST4+YSTRIDE3],mm3
#undef DST
#undef DST4
#undef YSTRIDE
#undef YSTRIDE3
#undef RESIDUE
  }
}
|
245 |
|
/*Reconstructs an inter-coded 8x8 fragment: prediction plus residue.
  Each of the four loop iterations processes two rows: the source bytes are
   zero-extended to 16 bits (unpack with the zeroed mm0), the 16-bit residue
   is added with signed saturation, and the sums are packed back to bytes
   with unsigned saturation.
  _dst:     Destination for the reconstructed block, _ystride bytes per row.
  _src:     Predictor block with the same layout.
  _ystride: Bytes between rows of _dst and _src.
  _residue: 64 16-bit residue values in raster order (32 bytes consumed per
             iteration).
  Clobbers mm0,mm2-mm4,mm7; no emms here (see oc_restore_fpu_mmx).*/
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
 int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm0 (the unpack source for byte->word zero extension); mm0 survives
     across the C loop because nothing between the asm blocks touches MMX.*/
  __asm pxor mm0,mm0;
  for(i=4;i-->0;){
    __asm{
#define DST edx
#define SRC ecx
#define YSTRIDE edi
#define RESIDUE eax
      /*The advanced pointers are stored back into the C locals at the end of
         the block, so each iteration reloads the current positions here.*/
      mov DST,_dst
      mov SRC,_src
      mov YSTRIDE,_ystride
      mov RESIDUE,_residue
      /*#0 Load source.*/
      movq mm3,[SRC]
      /*#1 Load source.*/
      movq mm7,[SRC+YSTRIDE]
      /*#0 Get copy of src.*/
      movq mm4,mm3
      /*#0 Expand high source.*/
      punpckhbw mm4,mm0
      /*#0 Expand low source.*/
      punpcklbw mm3,mm0
      /*#0 Add residue high.*/
      paddsw mm4,[8+RESIDUE]
      /*#1 Get copy of src.*/
      movq mm2,mm7
      /*#0 Add residue low.*/
      paddsw mm3,[RESIDUE]
      /*#1 Expand high source.*/
      punpckhbw mm2,mm0
      /*#0 Pack final row pixels.*/
      packuswb mm3,mm4
      /*#1 Expand low source.*/
      punpcklbw mm7,mm0
      /*#1 Add residue low.*/
      paddsw mm7,[16+RESIDUE]
      /*#1 Add residue high.*/
      paddsw mm2,[24+RESIDUE]
      /*Advance residue.*/
      lea RESIDUE,[32+RESIDUE]
      /*#1 Pack final row pixels.*/
      packuswb mm7,mm2
      /*Advance src.*/
      lea SRC,[SRC+YSTRIDE*2]
      /*#0 Write row.*/
      movq [DST],mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm7
      /*Advance dst.*/
      lea DST,[DST+YSTRIDE*2]
      /*Persist the advanced pointers for the next iteration.*/
      mov _residue,RESIDUE
      mov _dst,DST
      mov _src,SRC
#undef DST
#undef SRC
#undef YSTRIDE
#undef RESIDUE
    }
  }
}
|
309 |
|
/*Reconstructs a bi-predicted 8x8 fragment: the average of two predictors
   plus residue.
  Each of the four loop iterations processes two rows: both source blocks are
   zero-extended to 16 bits (unpack with the zeroed mm7), summed, then halved
   with an arithmetic right shift (truncating average), biased by the residue
   with signed saturation, and packed back to bytes with unsigned saturation.
  _dst:     Destination for the reconstructed block, _ystride bytes per row.
  _src1:    First predictor block.
  _src2:    Second predictor block.
  _ystride: Bytes between rows of all three blocks.
  _residue: 64 16-bit residue values in raster order (32 bytes consumed per
             iteration).
  Clobbers mm0-mm5,mm7; no emms here (see oc_restore_fpu_mmx).*/
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm7 (the unpack source for byte->word zero extension); mm7 survives
     across the C loop because nothing between the asm blocks touches MMX.*/
  __asm pxor mm7,mm7;
  for(i=4;i-->0;){
    __asm{
#define SRC1 ecx
#define SRC2 edi
#define YSTRIDE esi
#define RESIDUE edx
#define DST eax
      /*The advanced pointers are stored back into the C locals at the end of
         the block, so each iteration reloads the current positions here.*/
      mov YSTRIDE,_ystride
      mov DST,_dst
      mov RESIDUE,_residue
      mov SRC1,_src1
      mov SRC2,_src2
      /*#0 Load src1.*/
      movq mm0,[SRC1]
      /*#0 Load src2.*/
      movq mm2,[SRC2]
      /*#0 Copy src1.*/
      movq mm1,mm0
      /*#0 Copy src2.*/
      movq mm3,mm2
      /*#1 Load src1.*/
      movq mm4,[SRC1+YSTRIDE]
      /*#0 Unpack lower src1.*/
      punpcklbw mm0,mm7
      /*#1 Load src2.*/
      movq mm5,[SRC2+YSTRIDE]
      /*#0 Unpack higher src1.*/
      punpckhbw mm1,mm7
      /*#0 Unpack lower src2.*/
      punpcklbw mm2,mm7
      /*#0 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*Advance src1 ptr.*/
      lea SRC1,[SRC1+YSTRIDE*2]
      /*Advance src2 ptr.*/
      lea SRC2,[SRC2+YSTRIDE*2]
      /*#0 Lower src1+src2.*/
      paddsw mm0,mm2
      /*#0 Higher src1+src2.*/
      paddsw mm1,mm3
      /*#1 Copy src1.*/
      movq mm2,mm4
      /*#0 Build lo average.*/
      psraw mm0,1
      /*#1 Copy src2.*/
      movq mm3,mm5
      /*#1 Unpack lower src1.*/
      punpcklbw mm4,mm7
      /*#0 Build hi average.*/
      psraw mm1,1
      /*#1 Unpack higher src1.*/
      punpckhbw mm2,mm7
      /*#0 low+=residue.*/
      paddsw mm0,[RESIDUE]
      /*#1 Unpack lower src2.*/
      punpcklbw mm5,mm7
      /*#0 high+=residue.*/
      paddsw mm1,[8+RESIDUE]
      /*#1 Unpack higher src2.*/
      punpckhbw mm3,mm7
      /*#1 Lower src1+src2.*/
      paddsw mm5,mm4
      /*#0 Pack and saturate.*/
      packuswb mm0,mm1
      /*#1 Higher src1+src2.*/
      paddsw mm3,mm2
      /*#0 Write row.*/
      movq [DST],mm0
      /*#1 Build lo average.*/
      psraw mm5,1
      /*#1 Build hi average.*/
      psraw mm3,1
      /*#1 low+=residue.*/
      paddsw mm5,[16+RESIDUE]
      /*#1 high+=residue.*/
      paddsw mm3,[24+RESIDUE]
      /*#1 Pack and saturate.*/
      packuswb mm5,mm3
      /*#1 Write row.*/
      movq [DST+YSTRIDE],mm5
      /*Advance residue ptr.*/
      add RESIDUE,32
      /*Advance dest ptr.*/
      lea DST,[DST+YSTRIDE*2]
      /*Persist the advanced pointers for the next iteration.*/
      mov _dst,DST
      mov _residue,RESIDUE
      mov _src1,SRC1
      mov _src2,SRC2
#undef SRC1
#undef SRC2
#undef YSTRIDE
#undef RESIDUE
#undef DST
    }
  }
}
|
411 |
|
/*Exits MMX state: emms empties the MMX registers/x87 tag word so subsequent
   x87 floating-point code operates correctly.
  Call this after any sequence of the MMX routines above.*/
void oc_restore_fpu_mmx(void){
  __asm emms;
}
|
415 |
|
416 #endif |