michael@0: /******************************************************************** michael@0: * * michael@0: * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * michael@0: * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * michael@0: * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * michael@0: * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * michael@0: * * michael@0: * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * michael@0: * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * michael@0: * * michael@0: ******************************************************************** michael@0: michael@0: function: michael@0: last mod: $Id: mmxstate.c 17563 2010-10-25 17:40:54Z tterribe $ michael@0: michael@0: ********************************************************************/ michael@0: michael@0: /*MMX acceleration of complete fragment reconstruction algorithm. michael@0: Originally written by Rudolf Marek.*/ michael@0: #include michael@0: #include "x86int.h" michael@0: #include "mmxloop.h" michael@0: michael@0: #if defined(OC_X86_ASM) michael@0: michael@0: void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, michael@0: int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){ michael@0: unsigned char *dst; michael@0: ptrdiff_t frag_buf_off; michael@0: int ystride; michael@0: int refi; michael@0: /*Apply the inverse transform.*/ michael@0: /*Special case only having a DC component.*/ michael@0: if(_last_zzi<2){ michael@0: /*Note that this value must be unsigned, to keep the __asm__ block from michael@0: sign-extending it when it puts it in a register.*/ michael@0: ogg_uint16_t p; michael@0: int i; michael@0: /*We round this dequant product (and not any of the others) because there's michael@0: no iDCT rounding.*/ michael@0: p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5); michael@0: /*Fill _dct_coeffs with p.*/ michael@0: __asm__ __volatile__( michael@0: /*mm0=0000 0000 0000 AAAA*/ michael@0: "movd %[p],%%mm0\n\t" michael@0: /*mm0=0000 0000 AAAA AAAA*/ michael@0: "punpcklwd %%mm0,%%mm0\n\t" michael@0: /*mm0=AAAA AAAA AAAA AAAA*/ michael@0: "punpckldq %%mm0,%%mm0\n\t" michael@0: : michael@0: :[p]"r"((unsigned)p) michael@0: ); michael@0: for(i=0;i<4;i++){ michael@0: __asm__ __volatile__( michael@0: "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t" michael@0: "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t" michael@0: "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t" michael@0: "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t" michael@0: :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16) michael@0: ); michael@0: } michael@0: } michael@0: else{ michael@0: /*Dequantize the DC coefficient.*/ michael@0: _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant); michael@0: oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi); michael@0: } michael@0: /*Fill in the target buffer.*/ michael@0: frag_buf_off=_state->frag_buf_offs[_fragi]; michael@0: refi=_state->frags[_fragi].refi; michael@0: ystride=_state->ref_ystride[_pli]; michael@0: dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off; michael@0: if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64); michael@0: else{ michael@0: const unsigned char *ref; michael@0: int mvoffsets[2]; michael@0: ref=_state->ref_frame_data[refi]+frag_buf_off; michael@0: if(oc_state_get_mv_offsets(_state,mvoffsets,_pli, michael@0: _state->frag_mvs[_fragi])>1){ michael@0: oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride, michael@0: _dct_coeffs+64); michael@0: } michael@0: else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64); michael@0: } michael@0: } michael@0: michael@0: /*We copy these entire function to inline the actual MMX routines so that we michael@0: use only a single indirect call.*/ michael@0: michael@0: void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){ michael@0: memset(_bv,_flimit,8); michael@0: } michael@0: michael@0: /*Apply the loop filter to a given set of fragment rows in the given plane. michael@0: The filter may be run on the bottom edge, affecting pixels in the next row of michael@0: fragments, so this row also needs to be available. michael@0: _bv: The bounding values array. michael@0: _refi: The index of the frame buffer to filter. michael@0: _pli: The color plane to filter. michael@0: _fragy0: The Y coordinate of the first fragment row to filter. michael@0: _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ michael@0: void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, michael@0: signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){ michael@0: OC_ALIGN8(unsigned char ll[8]); michael@0: const oc_fragment_plane *fplane; michael@0: const oc_fragment *frags; michael@0: const ptrdiff_t *frag_buf_offs; michael@0: unsigned char *ref_frame_data; michael@0: ptrdiff_t fragi_top; michael@0: ptrdiff_t fragi_bot; michael@0: ptrdiff_t fragi0; michael@0: ptrdiff_t fragi0_end; michael@0: int ystride; michael@0: int nhfrags; michael@0: memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll)); michael@0: fplane=_state->fplanes+_pli; michael@0: nhfrags=fplane->nhfrags; michael@0: fragi_top=fplane->froffset; michael@0: fragi_bot=fragi_top+fplane->nfrags; michael@0: fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags; michael@0: fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags; michael@0: ystride=_state->ref_ystride[_pli]; michael@0: frags=_state->frags; michael@0: frag_buf_offs=_state->frag_buf_offs; michael@0: ref_frame_data=_state->ref_frame_data[_refi]; michael@0: /*The following loops are constructed somewhat non-intuitively on purpose. michael@0: The main idea is: if a block boundary has at least one coded fragment on michael@0: it, the filter is applied to it. michael@0: However, the order that the filters are applied in matters, and VP3 chose michael@0: the somewhat strange ordering used below.*/ michael@0: while(fragi0fragi0){ michael@0: OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll); michael@0: } michael@0: if(fragi0>fragi_top){ michael@0: OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll); michael@0: } michael@0: if(fragi+1fplanes+_pli; michael@0: nhfrags=fplane->nhfrags; michael@0: fragi_top=fplane->froffset; michael@0: fragi_bot=fragi_top+fplane->nfrags; michael@0: fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags; michael@0: fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags; michael@0: ystride=_state->ref_ystride[_pli]; michael@0: frags=_state->frags; michael@0: frag_buf_offs=_state->frag_buf_offs; michael@0: ref_frame_data=_state->ref_frame_data[_refi]; michael@0: /*The following loops are constructed somewhat non-intuitively on purpose. michael@0: The main idea is: if a block boundary has at least one coded fragment on michael@0: it, the filter is applied to it. michael@0: However, the order that the filters are applied in matters, and VP3 chose michael@0: the somewhat strange ordering used below.*/ michael@0: while(fragi0fragi0){ michael@0: OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv); michael@0: } michael@0: if(fragi0>fragi_top){ michael@0: OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv); michael@0: } michael@0: if(fragi+1