1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000 1.2 +++ b/media/libtheora/lib/x86/mmxstate.c Wed Dec 31 06:09:35 2014 +0100 1.3 @@ -0,0 +1,226 @@ 1.4 +/******************************************************************** 1.5 + * * 1.6 + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * 1.7 + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * 1.8 + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * 1.9 + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * 1.10 + * * 1.11 + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * 1.12 + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * 1.13 + * * 1.14 + ******************************************************************** 1.15 + 1.16 + function: 1.17 + last mod: $Id: mmxstate.c 17563 2010-10-25 17:40:54Z tterribe $ 1.18 + 1.19 + ********************************************************************/ 1.20 + 1.21 +/*MMX acceleration of complete fragment reconstruction algorithm. 1.22 + Originally written by Rudolf Marek.*/ 1.23 +#include <string.h> 1.24 +#include "x86int.h" 1.25 +#include "mmxloop.h" 1.26 + 1.27 +#if defined(OC_X86_ASM) 1.28 + 1.29 +void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, 1.30 + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){ 1.31 + unsigned char *dst; 1.32 + ptrdiff_t frag_buf_off; 1.33 + int ystride; 1.34 + int refi; 1.35 + /*Apply the inverse transform.*/ 1.36 + /*Special case only having a DC component.*/ 1.37 + if(_last_zzi<2){ 1.38 + /*Note that this value must be unsigned, to keep the __asm__ block from 1.39 + sign-extending it when it puts it in a register.*/ 1.40 + ogg_uint16_t p; 1.41 + int i; 1.42 + /*We round this dequant product (and not any of the others) because there's 1.43 + no iDCT rounding.*/ 1.44 + p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5); 1.45 + /*Fill _dct_coeffs with p.*/ 1.46 + __asm__ __volatile__( 1.47 + /*mm0=0000 0000 0000 AAAA*/ 1.48 + "movd %[p],%%mm0\n\t" 1.49 + /*mm0=0000 0000 AAAA AAAA*/ 1.50 + "punpcklwd %%mm0,%%mm0\n\t" 1.51 + /*mm0=AAAA AAAA AAAA AAAA*/ 1.52 + "punpckldq %%mm0,%%mm0\n\t" 1.53 + : 1.54 + :[p]"r"((unsigned)p) 1.55 + ); 1.56 + for(i=0;i<4;i++){ 1.57 + __asm__ __volatile__( 1.58 + "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t" 1.59 + "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t" 1.60 + "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t" 1.61 + "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t" 1.62 + :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16) 1.63 + ); 1.64 + } 1.65 + } 1.66 + else{ 1.67 + /*Dequantize the DC coefficient.*/ 1.68 + _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant); 1.69 + oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi); 1.70 + } 1.71 + /*Fill in the target buffer.*/ 1.72 + frag_buf_off=_state->frag_buf_offs[_fragi]; 1.73 + refi=_state->frags[_fragi].refi; 1.74 + ystride=_state->ref_ystride[_pli]; 1.75 + dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off; 1.76 + if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64); 1.77 + else{ 1.78 + const unsigned char *ref; 1.79 + int mvoffsets[2]; 1.80 + ref=_state->ref_frame_data[refi]+frag_buf_off; 1.81 + if(oc_state_get_mv_offsets(_state,mvoffsets,_pli, 1.82 + _state->frag_mvs[_fragi])>1){ 1.83 + oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride, 1.84 + _dct_coeffs+64); 1.85 + } 1.86 + else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64); 1.87 + } 1.88 +} 1.89 + 1.90 +/*We copy these entire function to inline the actual MMX routines so that we 1.91 + use only a single indirect call.*/ 1.92 + 1.93 +void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){ 1.94 + memset(_bv,_flimit,8); 1.95 +} 1.96 + 1.97 +/*Apply the loop filter to a given set of fragment rows in the given plane. 1.98 + The filter may be run on the bottom edge, affecting pixels in the next row of 1.99 + fragments, so this row also needs to be available. 1.100 + _bv: The bounding values array. 1.101 + _refi: The index of the frame buffer to filter. 1.102 + _pli: The color plane to filter. 1.103 + _fragy0: The Y coordinate of the first fragment row to filter. 1.104 + _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ 1.105 +void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, 1.106 + signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){ 1.107 + OC_ALIGN8(unsigned char ll[8]); 1.108 + const oc_fragment_plane *fplane; 1.109 + const oc_fragment *frags; 1.110 + const ptrdiff_t *frag_buf_offs; 1.111 + unsigned char *ref_frame_data; 1.112 + ptrdiff_t fragi_top; 1.113 + ptrdiff_t fragi_bot; 1.114 + ptrdiff_t fragi0; 1.115 + ptrdiff_t fragi0_end; 1.116 + int ystride; 1.117 + int nhfrags; 1.118 + memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll)); 1.119 + fplane=_state->fplanes+_pli; 1.120 + nhfrags=fplane->nhfrags; 1.121 + fragi_top=fplane->froffset; 1.122 + fragi_bot=fragi_top+fplane->nfrags; 1.123 + fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags; 1.124 + fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags; 1.125 + ystride=_state->ref_ystride[_pli]; 1.126 + frags=_state->frags; 1.127 + frag_buf_offs=_state->frag_buf_offs; 1.128 + ref_frame_data=_state->ref_frame_data[_refi]; 1.129 + /*The following loops are constructed somewhat non-intuitively on purpose. 1.130 + The main idea is: if a block boundary has at least one coded fragment on 1.131 + it, the filter is applied to it. 1.132 + However, the order that the filters are applied in matters, and VP3 chose 1.133 + the somewhat strange ordering used below.*/ 1.134 + while(fragi0<fragi0_end){ 1.135 + ptrdiff_t fragi; 1.136 + ptrdiff_t fragi_end; 1.137 + fragi=fragi0; 1.138 + fragi_end=fragi+nhfrags; 1.139 + while(fragi<fragi_end){ 1.140 + if(frags[fragi].coded){ 1.141 + unsigned char *ref; 1.142 + ref=ref_frame_data+frag_buf_offs[fragi]; 1.143 + if(fragi>fragi0){ 1.144 + OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll); 1.145 + } 1.146 + if(fragi0>fragi_top){ 1.147 + OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll); 1.148 + } 1.149 + if(fragi+1<fragi_end&&!frags[fragi+1].coded){ 1.150 + OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll); 1.151 + } 1.152 + if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){ 1.153 + OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride<<3),ystride,ll); 1.154 + } 1.155 + } 1.156 + fragi++; 1.157 + } 1.158 + fragi0+=nhfrags; 1.159 + } 1.160 +} 1.161 + 1.162 +void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){ 1.163 + memset(_bv,~(_flimit<<1),8); 1.164 +} 1.165 + 1.166 +/*Apply the loop filter to a given set of fragment rows in the given plane. 1.167 + The filter may be run on the bottom edge, affecting pixels in the next row of 1.168 + fragments, so this row also needs to be available. 1.169 + _bv: The bounding values array. 1.170 + _refi: The index of the frame buffer to filter. 1.171 + _pli: The color plane to filter. 1.172 + _fragy0: The Y coordinate of the first fragment row to filter. 1.173 + _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ 1.174 +void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state, 1.175 + signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){ 1.176 + const oc_fragment_plane *fplane; 1.177 + const oc_fragment *frags; 1.178 + const ptrdiff_t *frag_buf_offs; 1.179 + unsigned char *ref_frame_data; 1.180 + ptrdiff_t fragi_top; 1.181 + ptrdiff_t fragi_bot; 1.182 + ptrdiff_t fragi0; 1.183 + ptrdiff_t fragi0_end; 1.184 + int ystride; 1.185 + int nhfrags; 1.186 + fplane=_state->fplanes+_pli; 1.187 + nhfrags=fplane->nhfrags; 1.188 + fragi_top=fplane->froffset; 1.189 + fragi_bot=fragi_top+fplane->nfrags; 1.190 + fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags; 1.191 + fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags; 1.192 + ystride=_state->ref_ystride[_pli]; 1.193 + frags=_state->frags; 1.194 + frag_buf_offs=_state->frag_buf_offs; 1.195 + ref_frame_data=_state->ref_frame_data[_refi]; 1.196 + /*The following loops are constructed somewhat non-intuitively on purpose. 1.197 + The main idea is: if a block boundary has at least one coded fragment on 1.198 + it, the filter is applied to it. 1.199 + However, the order that the filters are applied in matters, and VP3 chose 1.200 + the somewhat strange ordering used below.*/ 1.201 + while(fragi0<fragi0_end){ 1.202 + ptrdiff_t fragi; 1.203 + ptrdiff_t fragi_end; 1.204 + fragi=fragi0; 1.205 + fragi_end=fragi+nhfrags; 1.206 + while(fragi<fragi_end){ 1.207 + if(frags[fragi].coded){ 1.208 + unsigned char *ref; 1.209 + ref=ref_frame_data+frag_buf_offs[fragi]; 1.210 + if(fragi>fragi0){ 1.211 + OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv); 1.212 + } 1.213 + if(fragi0>fragi_top){ 1.214 + OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv); 1.215 + } 1.216 + if(fragi+1<fragi_end&&!frags[fragi+1].coded){ 1.217 + OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv); 1.218 + } 1.219 + if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){ 1.220 + OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv); 1.221 + } 1.222 + } 1.223 + fragi++; 1.224 + } 1.225 + fragi0+=nhfrags; 1.226 + } 1.227 +} 1.228 + 1.229 +#endif