/********************************************************************
 *                                                                  *
 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
 *                                                                  *
 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
 *                                                                  *
 ********************************************************************

  function:
  last mod: $Id: mmxfrag.c 17410 2010-09-21 21:53:48Z tterribe $

 ********************************************************************/

/*MMX acceleration of fragment reconstruction for motion compensation.
  Originally written by Rudolf Marek.
  Additional optimization by Nils Pipenbrinck.
  Note: Loops are unrolled for best performance.
  The iteration each instruction belongs to is marked in the comments as #i.*/
#include <stddef.h>
#include "x86int.h"

#if defined(OC_X86_ASM)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.*/
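/*For reference, a scalar sketch of the same operation (illustrative only;
   assumes <string.h> for memcpy and is not part of the library):
    for(i=0;i<8;i++)memcpy(_dst+_ystride*i,_src+_ystride*i,8);
  The MMX version performs the same 64-byte copy as eight unrolled movq
   transfers, computing ystride*3 once so every row can be addressed with a
   single base+index operation.*/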
# define OC_FRAG_COPY_MMX(_dst,_src,_ystride) \
  do{ \
    const unsigned char *src; \
    unsigned char       *dst; \
    ptrdiff_t            ystride3; \
    src=(_src); \
    dst=(_dst); \
    __asm__ __volatile__( \
      /*src+0*ystride*/ \
      "movq (%[src]),%%mm0\n\t" \
      /*src+1*ystride*/ \
      "movq (%[src],%[ystride]),%%mm1\n\t" \
      /*ystride3=ystride*3*/ \
      "lea (%[ystride],%[ystride],2),%[ystride3]\n\t" \
      /*src+2*ystride*/ \
      "movq (%[src],%[ystride],2),%%mm2\n\t" \
      /*src+3*ystride*/ \
      "movq (%[src],%[ystride3]),%%mm3\n\t" \
      /*dst+0*ystride*/ \
      "movq %%mm0,(%[dst])\n\t" \
      /*dst+1*ystride*/ \
      "movq %%mm1,(%[dst],%[ystride])\n\t" \
      /*Pointer to next 4.*/ \
      "lea (%[src],%[ystride],4),%[src]\n\t" \
      /*dst+2*ystride*/ \
      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
      /*dst+3*ystride*/ \
      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
      /*Pointer to next 4.*/ \
      "lea (%[dst],%[ystride],4),%[dst]\n\t" \
      /*src+0*ystride*/ \
      "movq (%[src]),%%mm0\n\t" \
      /*src+1*ystride*/ \
      "movq (%[src],%[ystride]),%%mm1\n\t" \
      /*src+2*ystride*/ \
      "movq (%[src],%[ystride],2),%%mm2\n\t" \
      /*src+3*ystride*/ \
      "movq (%[src],%[ystride3]),%%mm3\n\t" \
      /*dst+0*ystride*/ \
      "movq %%mm0,(%[dst])\n\t" \
      /*dst+1*ystride*/ \
      "movq %%mm1,(%[dst],%[ystride])\n\t" \
      /*dst+2*ystride*/ \
      "movq %%mm2,(%[dst],%[ystride],2)\n\t" \
      /*dst+3*ystride*/ \
      "movq %%mm3,(%[dst],%[ystride3])\n\t" \
      :[dst]"+r"(dst),[src]"+r"(src),[ystride3]"=&r"(ystride3) \
      :[ystride]"r"((ptrdiff_t)(_ystride)) \
      :"memory" \
    ); \
  } \
  while(0)

/*Copies an 8x8 block of pixels from _src to _dst, assuming _ystride bytes
   between rows.*/
void oc_frag_copy_mmx(unsigned char *_dst,
 const unsigned char *_src,int _ystride){
  OC_FRAG_COPY_MMX(_dst,_src,_ystride);
}

/*Copies the fragments specified by the lists of fragment indices from one
   frame to another.
  _dst_frame:     The reference frame to copy to.
  _src_frame:     The reference frame to copy from.
  _ystride:       The row stride of the reference frames.
  _fragis:        A pointer to a list of fragment indices.
  _nfragis:       The number of fragment indices to copy.
  _frag_buf_offs: The offsets of fragments in the reference frames.*/
void oc_frag_copy_list_mmx(unsigned char *_dst_frame,
 const unsigned char *_src_frame,int _ystride,
 const ptrdiff_t *_fragis,ptrdiff_t _nfragis,const ptrdiff_t *_frag_buf_offs){
  ptrdiff_t fragii;
  for(fragii=0;fragii<_nfragis;fragii++){
    ptrdiff_t frag_buf_off;
    frag_buf_off=_frag_buf_offs[_fragis[fragii]];
    OC_FRAG_COPY_MMX(_dst_frame+frag_buf_off,
     _src_frame+frag_buf_off,_ystride);
  }
}


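/*Reconstructs an intra-coded fragment: each output pixel is the corresponding
   residue value biased by 128 and clamped to [0,255].
  A scalar sketch of the computation (illustrative only; OC_CLAMP255 here
   stands for any clamp of an int to [0,255]):
    for(i=0;i<8;i++)for(j=0;j<8;j++){
      _dst[i*_ystride+j]=OC_CLAMP255(_residue[i*8+j]+128);
    }
  The MMX version synthesizes the 0x0080 bias constant in mm0 with
   pcmpeqw/psllw/psrlw (avoiding a memory load), applies it with saturating
   adds, and gets the clamp for free from the unsigned saturation in
   packuswb.*/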
void oc_frag_recon_intra_mmx(unsigned char *_dst,int _ystride,
 const ogg_int16_t *_residue){
  __asm__ __volatile__(
    /*Set mm0 to 0xFFFFFFFFFFFFFFFF.*/
    "pcmpeqw %%mm0,%%mm0\n\t"
    /*#0 Load low residue.*/
    "movq 0*8(%[residue]),%%mm1\n\t"
    /*#0 Load high residue.*/
    "movq 1*8(%[residue]),%%mm2\n\t"
    /*Set mm0 to 0x8000800080008000.*/
    "psllw $15,%%mm0\n\t"
    /*#1 Load low residue.*/
    "movq 2*8(%[residue]),%%mm3\n\t"
    /*#1 Load high residue.*/
    "movq 3*8(%[residue]),%%mm4\n\t"
    /*Set mm0 to 0x0080008000800080.*/
    "psrlw $8,%%mm0\n\t"
    /*#2 Load low residue.*/
    "movq 4*8(%[residue]),%%mm5\n\t"
    /*#2 Load high residue.*/
    "movq 5*8(%[residue]),%%mm6\n\t"
    /*#0 Bias low residue.*/
    "paddsw %%mm0,%%mm1\n\t"
    /*#0 Bias high residue.*/
    "paddsw %%mm0,%%mm2\n\t"
    /*#0 Pack to byte.*/
    "packuswb %%mm2,%%mm1\n\t"
    /*#1 Bias low residue.*/
    "paddsw %%mm0,%%mm3\n\t"
    /*#1 Bias high residue.*/
    "paddsw %%mm0,%%mm4\n\t"
    /*#1 Pack to byte.*/
    "packuswb %%mm4,%%mm3\n\t"
    /*#2 Bias low residue.*/
    "paddsw %%mm0,%%mm5\n\t"
    /*#2 Bias high residue.*/
    "paddsw %%mm0,%%mm6\n\t"
    /*#2 Pack to byte.*/
    "packuswb %%mm6,%%mm5\n\t"
    /*#0 Write row.*/
    "movq %%mm1,(%[dst])\n\t"
    /*#1 Write row.*/
    "movq %%mm3,(%[dst],%[ystride])\n\t"
    /*#2 Write row.*/
    "movq %%mm5,(%[dst],%[ystride],2)\n\t"
    /*#3 Load low residue.*/
    "movq 6*8(%[residue]),%%mm1\n\t"
    /*#3 Load high residue.*/
    "movq 7*8(%[residue]),%%mm2\n\t"
    /*#4 Load low residue.*/
    "movq 8*8(%[residue]),%%mm3\n\t"
    /*#4 Load high residue.*/
    "movq 9*8(%[residue]),%%mm4\n\t"
    /*#5 Load low residue.*/
    "movq 10*8(%[residue]),%%mm5\n\t"
    /*#5 Load high residue.*/
    "movq 11*8(%[residue]),%%mm6\n\t"
    /*#3 Bias low residue.*/
    "paddsw %%mm0,%%mm1\n\t"
    /*#3 Bias high residue.*/
    "paddsw %%mm0,%%mm2\n\t"
    /*#3 Pack to byte.*/
    "packuswb %%mm2,%%mm1\n\t"
    /*#4 Bias low residue.*/
    "paddsw %%mm0,%%mm3\n\t"
    /*#4 Bias high residue.*/
    "paddsw %%mm0,%%mm4\n\t"
    /*#4 Pack to byte.*/
    "packuswb %%mm4,%%mm3\n\t"
    /*#5 Bias low residue.*/
    "paddsw %%mm0,%%mm5\n\t"
    /*#5 Bias high residue.*/
    "paddsw %%mm0,%%mm6\n\t"
    /*#5 Pack to byte.*/
    "packuswb %%mm6,%%mm5\n\t"
    /*#3 Write row.*/
    "movq %%mm1,(%[dst],%[ystride3])\n\t"
    /*#4 Write row.*/
    "movq %%mm3,(%[dst4])\n\t"
    /*#5 Write row.*/
    "movq %%mm5,(%[dst4],%[ystride])\n\t"
    /*#6 Load low residue.*/
    "movq 12*8(%[residue]),%%mm1\n\t"
    /*#6 Load high residue.*/
    "movq 13*8(%[residue]),%%mm2\n\t"
    /*#7 Load low residue.*/
    "movq 14*8(%[residue]),%%mm3\n\t"
    /*#7 Load high residue.*/
    "movq 15*8(%[residue]),%%mm4\n\t"
    /*#6 Bias low residue.*/
    "paddsw %%mm0,%%mm1\n\t"
    /*#6 Bias high residue.*/
    "paddsw %%mm0,%%mm2\n\t"
    /*#6 Pack to byte.*/
    "packuswb %%mm2,%%mm1\n\t"
    /*#7 Bias low residue.*/
    "paddsw %%mm0,%%mm3\n\t"
    /*#7 Bias high residue.*/
    "paddsw %%mm0,%%mm4\n\t"
    /*#7 Pack to byte.*/
    "packuswb %%mm4,%%mm3\n\t"
    /*#6 Write row.*/
    "movq %%mm1,(%[dst4],%[ystride],2)\n\t"
    /*#7 Write row.*/
    "movq %%mm3,(%[dst4],%[ystride3])\n\t"
    :
    :[residue]"r"(_residue),
     [dst]"r"(_dst),
     [dst4]"r"(_dst+(_ystride<<2)),
     [ystride]"r"((ptrdiff_t)_ystride),
     [ystride3]"r"((ptrdiff_t)_ystride*3)
    :"memory"
  );
}

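/*Reconstructs an inter-coded fragment: each output pixel is the predictor
   pixel from _src plus the corresponding residue, clamped to [0,255].
  A scalar sketch (illustrative only; OC_CLAMP255 again stands for a clamp of
   an int to [0,255]):
    for(i=0;i<8;i++)for(j=0;j<8;j++){
      _dst[i*_ystride+j]=OC_CLAMP255(_src[i*_ystride+j]+_residue[i*8+j]);
    }
  The loop below handles two rows per iteration: source bytes are widened to
   16 bits with punpcklbw/punpckhbw against a zero register, added to the
   residue with saturation, and repacked with packuswb.*/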
void oc_frag_recon_inter_mmx(unsigned char *_dst,const unsigned char *_src,
 int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm0.*/
  __asm__ __volatile__("pxor %%mm0,%%mm0\n\t"::);
  for(i=4;i-->0;){
    __asm__ __volatile__(
      /*#0 Load source.*/
      "movq (%[src]),%%mm3\n\t"
      /*#1 Load source.*/
      "movq (%[src],%[ystride]),%%mm7\n\t"
      /*#0 Get copy of src.*/
      "movq %%mm3,%%mm4\n\t"
      /*#0 Expand high source.*/
      "punpckhbw %%mm0,%%mm4\n\t"
      /*#0 Expand low source.*/
      "punpcklbw %%mm0,%%mm3\n\t"
      /*#0 Add residue high.*/
      "paddsw 8(%[residue]),%%mm4\n\t"
      /*#1 Get copy of src.*/
      "movq %%mm7,%%mm2\n\t"
      /*#0 Add residue low.*/
      "paddsw (%[residue]),%%mm3\n\t"
      /*#1 Expand high source.*/
      "punpckhbw %%mm0,%%mm2\n\t"
      /*#0 Pack final row pixels.*/
      "packuswb %%mm4,%%mm3\n\t"
      /*#1 Expand low source.*/
      "punpcklbw %%mm0,%%mm7\n\t"
      /*#1 Add residue low.*/
      "paddsw 16(%[residue]),%%mm7\n\t"
      /*#1 Add residue high.*/
      "paddsw 24(%[residue]),%%mm2\n\t"
      /*Advance residue.*/
      "lea 32(%[residue]),%[residue]\n\t"
      /*#1 Pack final row pixels.*/
      "packuswb %%mm2,%%mm7\n\t"
      /*Advance src.*/
      "lea (%[src],%[ystride],2),%[src]\n\t"
      /*#0 Write row.*/
      "movq %%mm3,(%[dst])\n\t"
      /*#1 Write row.*/
      "movq %%mm7,(%[dst],%[ystride])\n\t"
      /*Advance dst.*/
      "lea (%[dst],%[ystride],2),%[dst]\n\t"
      :[residue]"+r"(_residue),[dst]"+r"(_dst),[src]"+r"(_src)
      :[ystride]"r"((ptrdiff_t)_ystride)
      :"memory"
    );
  }
}

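/*Reconstructs a bi-predicted fragment: each output pixel is the truncated
   average of the two predictors plus the corresponding residue, clamped to
   [0,255].
  A scalar sketch (illustrative only):
    for(i=0;i<8;i++)for(j=0;j<8;j++){
      _dst[i*_ystride+j]=OC_CLAMP255(
       ((_src1[i*_ystride+j]+_src2[i*_ystride+j])>>1)+_residue[i*8+j]);
    }
  The average is computed in 16 bits (paddsw then psraw $1); the widened sums
   cannot exceed 510, so they never saturate or go negative.*/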
void oc_frag_recon_inter2_mmx(unsigned char *_dst,const unsigned char *_src1,
 const unsigned char *_src2,int _ystride,const ogg_int16_t *_residue){
  int i;
  /*Zero mm7.*/
  __asm__ __volatile__("pxor %%mm7,%%mm7\n\t"::);
  for(i=4;i-->0;){
    __asm__ __volatile__(
      /*#0 Load src1.*/
      "movq (%[src1]),%%mm0\n\t"
      /*#0 Load src2.*/
      "movq (%[src2]),%%mm2\n\t"
      /*#0 Copy src1.*/
      "movq %%mm0,%%mm1\n\t"
      /*#0 Copy src2.*/
      "movq %%mm2,%%mm3\n\t"
      /*#1 Load src1.*/
      "movq (%[src1],%[ystride]),%%mm4\n\t"
      /*#0 Unpack lower src1.*/
      "punpcklbw %%mm7,%%mm0\n\t"
      /*#1 Load src2.*/
      "movq (%[src2],%[ystride]),%%mm5\n\t"
      /*#0 Unpack higher src1.*/
      "punpckhbw %%mm7,%%mm1\n\t"
      /*#0 Unpack lower src2.*/
      "punpcklbw %%mm7,%%mm2\n\t"
      /*#0 Unpack higher src2.*/
      "punpckhbw %%mm7,%%mm3\n\t"
      /*Advance src1 ptr.*/
      "lea (%[src1],%[ystride],2),%[src1]\n\t"
      /*Advance src2 ptr.*/
      "lea (%[src2],%[ystride],2),%[src2]\n\t"
      /*#0 Lower src1+src2.*/
      "paddsw %%mm2,%%mm0\n\t"
      /*#0 Higher src1+src2.*/
      "paddsw %%mm3,%%mm1\n\t"
      /*#1 Copy src1.*/
      "movq %%mm4,%%mm2\n\t"
      /*#0 Build lo average.*/
      "psraw $1,%%mm0\n\t"
      /*#1 Copy src2.*/
      "movq %%mm5,%%mm3\n\t"
      /*#1 Unpack lower src1.*/
      "punpcklbw %%mm7,%%mm4\n\t"
      /*#0 Build hi average.*/
      "psraw $1,%%mm1\n\t"
      /*#1 Unpack higher src1.*/
      "punpckhbw %%mm7,%%mm2\n\t"
      /*#0 low+=residue.*/
      "paddsw (%[residue]),%%mm0\n\t"
      /*#1 Unpack lower src2.*/
      "punpcklbw %%mm7,%%mm5\n\t"
      /*#0 high+=residue.*/
      "paddsw 8(%[residue]),%%mm1\n\t"
      /*#1 Unpack higher src2.*/
      "punpckhbw %%mm7,%%mm3\n\t"
      /*#1 Lower src1+src2.*/
      "paddsw %%mm4,%%mm5\n\t"
      /*#0 Pack and saturate.*/
      "packuswb %%mm1,%%mm0\n\t"
      /*#1 Higher src1+src2.*/
      "paddsw %%mm2,%%mm3\n\t"
      /*#0 Write row.*/
      "movq %%mm0,(%[dst])\n\t"
      /*#1 Build lo average.*/
      "psraw $1,%%mm5\n\t"
      /*#1 Build hi average.*/
      "psraw $1,%%mm3\n\t"
      /*#1 low+=residue.*/
      "paddsw 16(%[residue]),%%mm5\n\t"
      /*#1 high+=residue.*/
      "paddsw 24(%[residue]),%%mm3\n\t"
      /*#1 Pack and saturate.*/
      "packuswb %%mm3,%%mm5\n\t"
      /*#1 Write row.*/
      "movq %%mm5,(%[dst],%[ystride])\n\t"
      /*Advance residue ptr.*/
      "add $32,%[residue]\n\t"
      /*Advance dest ptr.*/
      "lea (%[dst],%[ystride],2),%[dst]\n\t"
      :[dst]"+r"(_dst),[residue]"+r"(_residue),
       [src1]"+%r"(_src1),[src2]"+r"(_src2)
      :[ystride]"r"((ptrdiff_t)_ystride)
      :"memory"
    );
  }
}

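/*Restores the FPU state after MMX code has run.
  The MMX registers alias the x87 floating-point register stack, so emms must
   be executed before any subsequent floating-point code.*/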
void oc_restore_fpu_mmx(void){
  __asm__ __volatile__("emms\n\t");
}
#endif