|
1 /******************************************************************** |
|
2 * * |
|
3 * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
|
4 * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
|
5 * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
|
6 * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
|
7 * * |
|
8 * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
|
9 * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
|
10 * * |
|
11 ******************************************************************** |
|
12 |
|
13 function: |
|
14 last mod: $Id: mmxstate.c 17563 2010-10-25 17:40:54Z tterribe $ |
|
15 |
|
16 ********************************************************************/ |
|
17 |
|
18 /*MMX acceleration of complete fragment reconstruction algorithm. |
|
19 Originally written by Rudolf Marek.*/ |
|
20 #include <string.h> |
|
21 #include "x86int.h" |
|
22 #include "mmxloop.h" |
|
23 |
|
24 #if defined(OC_X86_ASM) |
|
25 |
|
26 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, |
|
27 int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){ |
|
28 unsigned char *dst; |
|
29 ptrdiff_t frag_buf_off; |
|
30 int ystride; |
|
31 int refi; |
|
32 /*Apply the inverse transform.*/ |
|
33 /*Special case only having a DC component.*/ |
|
34 if(_last_zzi<2){ |
|
35 /*Note that this value must be unsigned, to keep the __asm__ block from |
|
36 sign-extending it when it puts it in a register.*/ |
|
37 ogg_uint16_t p; |
|
38 int i; |
|
39 /*We round this dequant product (and not any of the others) because there's |
|
40 no iDCT rounding.*/ |
|
41 p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5); |
|
42 /*Fill _dct_coeffs with p.*/ |
|
43 __asm__ __volatile__( |
|
44 /*mm0=0000 0000 0000 AAAA*/ |
|
45 "movd %[p],%%mm0\n\t" |
|
46 /*mm0=0000 0000 AAAA AAAA*/ |
|
47 "punpcklwd %%mm0,%%mm0\n\t" |
|
48 /*mm0=AAAA AAAA AAAA AAAA*/ |
|
49 "punpckldq %%mm0,%%mm0\n\t" |
|
50 : |
|
51 :[p]"r"((unsigned)p) |
|
52 ); |
|
53 for(i=0;i<4;i++){ |
|
54 __asm__ __volatile__( |
|
55 "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t" |
|
56 "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t" |
|
57 "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t" |
|
58 "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t" |
|
59 :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16) |
|
60 ); |
|
61 } |
|
62 } |
|
63 else{ |
|
64 /*Dequantize the DC coefficient.*/ |
|
65 _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant); |
|
66 oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi); |
|
67 } |
|
68 /*Fill in the target buffer.*/ |
|
69 frag_buf_off=_state->frag_buf_offs[_fragi]; |
|
70 refi=_state->frags[_fragi].refi; |
|
71 ystride=_state->ref_ystride[_pli]; |
|
72 dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off; |
|
73 if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64); |
|
74 else{ |
|
75 const unsigned char *ref; |
|
76 int mvoffsets[2]; |
|
77 ref=_state->ref_frame_data[refi]+frag_buf_off; |
|
78 if(oc_state_get_mv_offsets(_state,mvoffsets,_pli, |
|
79 _state->frag_mvs[_fragi])>1){ |
|
80 oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride, |
|
81 _dct_coeffs+64); |
|
82 } |
|
83 else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64); |
|
84 } |
|
85 } |
|
86 |
|
/*We copy these entire functions to inline the actual MMX routines so that we
   use only a single indirect call.*/
|
89 |
|
/*Initialize the bounding values array for the MMX loop filter.
  Only the first 8 entries of _bv are written; each receives _flimit
   (converted to a byte by memset()).
  _bv:     The bounding values array to initialize.
  _flimit: The filter limit to store.*/
void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
  size_t nbytes;
  nbytes=8*sizeof(*_bv);
  memset(_bv,_flimit,nbytes);
}
|
93 |
|
94 /*Apply the loop filter to a given set of fragment rows in the given plane. |
|
95 The filter may be run on the bottom edge, affecting pixels in the next row of |
|
96 fragments, so this row also needs to be available. |
|
97 _bv: The bounding values array. |
|
98 _refi: The index of the frame buffer to filter. |
|
99 _pli: The color plane to filter. |
|
100 _fragy0: The Y coordinate of the first fragment row to filter. |
|
101 _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ |
|
102 void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, |
|
103 signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){ |
|
104 OC_ALIGN8(unsigned char ll[8]); |
|
105 const oc_fragment_plane *fplane; |
|
106 const oc_fragment *frags; |
|
107 const ptrdiff_t *frag_buf_offs; |
|
108 unsigned char *ref_frame_data; |
|
109 ptrdiff_t fragi_top; |
|
110 ptrdiff_t fragi_bot; |
|
111 ptrdiff_t fragi0; |
|
112 ptrdiff_t fragi0_end; |
|
113 int ystride; |
|
114 int nhfrags; |
|
115 memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll)); |
|
116 fplane=_state->fplanes+_pli; |
|
117 nhfrags=fplane->nhfrags; |
|
118 fragi_top=fplane->froffset; |
|
119 fragi_bot=fragi_top+fplane->nfrags; |
|
120 fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags; |
|
121 fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags; |
|
122 ystride=_state->ref_ystride[_pli]; |
|
123 frags=_state->frags; |
|
124 frag_buf_offs=_state->frag_buf_offs; |
|
125 ref_frame_data=_state->ref_frame_data[_refi]; |
|
126 /*The following loops are constructed somewhat non-intuitively on purpose. |
|
127 The main idea is: if a block boundary has at least one coded fragment on |
|
128 it, the filter is applied to it. |
|
129 However, the order that the filters are applied in matters, and VP3 chose |
|
130 the somewhat strange ordering used below.*/ |
|
131 while(fragi0<fragi0_end){ |
|
132 ptrdiff_t fragi; |
|
133 ptrdiff_t fragi_end; |
|
134 fragi=fragi0; |
|
135 fragi_end=fragi+nhfrags; |
|
136 while(fragi<fragi_end){ |
|
137 if(frags[fragi].coded){ |
|
138 unsigned char *ref; |
|
139 ref=ref_frame_data+frag_buf_offs[fragi]; |
|
140 if(fragi>fragi0){ |
|
141 OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll); |
|
142 } |
|
143 if(fragi0>fragi_top){ |
|
144 OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll); |
|
145 } |
|
146 if(fragi+1<fragi_end&&!frags[fragi+1].coded){ |
|
147 OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll); |
|
148 } |
|
149 if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){ |
|
150 OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride<<3),ystride,ll); |
|
151 } |
|
152 } |
|
153 fragi++; |
|
154 } |
|
155 fragi0+=nhfrags; |
|
156 } |
|
157 } |
|
158 |
|
/*Initialize the bounding values array for the MMXEXT loop filter.
  Only the first 8 entries of _bv are written; each receives the bitwise
   complement of twice _flimit (converted to a byte by memset()).
  _bv:     The bounding values array to initialize.
  _flimit: The filter limit to encode.*/
void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
  size_t nbytes;
  int    fill;
  nbytes=8*sizeof(*_bv);
  fill=~(_flimit<<1);
  memset(_bv,fill,nbytes);
}
|
162 |
|
163 /*Apply the loop filter to a given set of fragment rows in the given plane. |
|
164 The filter may be run on the bottom edge, affecting pixels in the next row of |
|
165 fragments, so this row also needs to be available. |
|
166 _bv: The bounding values array. |
|
167 _refi: The index of the frame buffer to filter. |
|
168 _pli: The color plane to filter. |
|
169 _fragy0: The Y coordinate of the first fragment row to filter. |
|
170 _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ |
|
171 void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state, |
|
172 signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){ |
|
173 const oc_fragment_plane *fplane; |
|
174 const oc_fragment *frags; |
|
175 const ptrdiff_t *frag_buf_offs; |
|
176 unsigned char *ref_frame_data; |
|
177 ptrdiff_t fragi_top; |
|
178 ptrdiff_t fragi_bot; |
|
179 ptrdiff_t fragi0; |
|
180 ptrdiff_t fragi0_end; |
|
181 int ystride; |
|
182 int nhfrags; |
|
183 fplane=_state->fplanes+_pli; |
|
184 nhfrags=fplane->nhfrags; |
|
185 fragi_top=fplane->froffset; |
|
186 fragi_bot=fragi_top+fplane->nfrags; |
|
187 fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags; |
|
188 fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags; |
|
189 ystride=_state->ref_ystride[_pli]; |
|
190 frags=_state->frags; |
|
191 frag_buf_offs=_state->frag_buf_offs; |
|
192 ref_frame_data=_state->ref_frame_data[_refi]; |
|
193 /*The following loops are constructed somewhat non-intuitively on purpose. |
|
194 The main idea is: if a block boundary has at least one coded fragment on |
|
195 it, the filter is applied to it. |
|
196 However, the order that the filters are applied in matters, and VP3 chose |
|
197 the somewhat strange ordering used below.*/ |
|
198 while(fragi0<fragi0_end){ |
|
199 ptrdiff_t fragi; |
|
200 ptrdiff_t fragi_end; |
|
201 fragi=fragi0; |
|
202 fragi_end=fragi+nhfrags; |
|
203 while(fragi<fragi_end){ |
|
204 if(frags[fragi].coded){ |
|
205 unsigned char *ref; |
|
206 ref=ref_frame_data+frag_buf_offs[fragi]; |
|
207 if(fragi>fragi0){ |
|
208 OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv); |
|
209 } |
|
210 if(fragi0>fragi_top){ |
|
211 OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv); |
|
212 } |
|
213 if(fragi+1<fragi_end&&!frags[fragi+1].coded){ |
|
214 OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv); |
|
215 } |
|
216 if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){ |
|
217 OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv); |
|
218 } |
|
219 } |
|
220 fragi++; |
|
221 } |
|
222 fragi0+=nhfrags; |
|
223 } |
|
224 } |
|
225 |
|
226 #endif |