media/libtheora/lib/x86/mmxstate.c

changeset 0
6474c204b198
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/media/libtheora/lib/x86/mmxstate.c	Wed Dec 31 06:09:35 2014 +0100
     1.3 @@ -0,0 +1,226 @@
     1.4 +/********************************************************************
     1.5 + *                                                                  *
     1.6 + * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
     1.7 + * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
     1.8 + * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
     1.9 + * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
    1.10 + *                                                                  *
    1.11 + * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
    1.12 + * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
    1.13 + *                                                                  *
    1.14 + ********************************************************************
    1.15 +
    1.16 +  function:
    1.17 +    last mod: $Id: mmxstate.c 17563 2010-10-25 17:40:54Z tterribe $
    1.18 +
    1.19 + ********************************************************************/
    1.20 +
    1.21 +/*MMX acceleration of complete fragment reconstruction algorithm.
    1.22 +  Originally written by Rudolf Marek.*/
    1.23 +#include <string.h>
    1.24 +#include "x86int.h"
    1.25 +#include "mmxloop.h"
    1.26 +
    1.27 +#if defined(OC_X86_ASM)
    1.28 +
    1.29 +void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
    1.30 + int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
    1.31 +  unsigned char *dst;
    1.32 +  ptrdiff_t      frag_buf_off;
    1.33 +  int            ystride;
    1.34 +  int            refi;
    1.35 +  /*Apply the inverse transform.*/
    1.36 +  /*Special case only having a DC component.*/
    1.37 +  if(_last_zzi<2){
    1.38 +    /*Note that this value must be unsigned, to keep the __asm__ block from
    1.39 +       sign-extending it when it puts it in a register.*/
    1.40 +    ogg_uint16_t p;
    1.41 +    int          i;
    1.42 +    /*We round this dequant product (and not any of the others) because there's
    1.43 +       no iDCT rounding.*/
    1.44 +    p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
    1.45 +    /*Fill _dct_coeffs with p.*/
    1.46 +    __asm__ __volatile__(
    1.47 +      /*mm0=0000 0000 0000 AAAA*/
    1.48 +      "movd %[p],%%mm0\n\t"
    1.49 +      /*mm0=0000 0000 AAAA AAAA*/
    1.50 +      "punpcklwd %%mm0,%%mm0\n\t"
    1.51 +      /*mm0=AAAA AAAA AAAA AAAA*/
    1.52 +      "punpckldq %%mm0,%%mm0\n\t"
    1.53 +      :
    1.54 +      :[p]"r"((unsigned)p)
    1.55 +    );
    1.56 +    for(i=0;i<4;i++){
    1.57 +      __asm__ __volatile__(
    1.58 +        "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"
    1.59 +        "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"
    1.60 +        "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"
    1.61 +        "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"
    1.62 +        :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16)
    1.63 +      );
    1.64 +    }
    1.65 +  }
    1.66 +  else{
    1.67 +    /*Dequantize the DC coefficient.*/
    1.68 +    _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
    1.69 +    oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
    1.70 +  }
    1.71 +  /*Fill in the target buffer.*/
    1.72 +  frag_buf_off=_state->frag_buf_offs[_fragi];
    1.73 +  refi=_state->frags[_fragi].refi;
    1.74 +  ystride=_state->ref_ystride[_pli];
    1.75 +  dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
    1.76 +  if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
    1.77 +  else{
    1.78 +    const unsigned char *ref;
    1.79 +    int                  mvoffsets[2];
    1.80 +    ref=_state->ref_frame_data[refi]+frag_buf_off;
    1.81 +    if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
    1.82 +     _state->frag_mvs[_fragi])>1){
    1.83 +      oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
    1.84 +       _dct_coeffs+64);
    1.85 +    }
    1.86 +    else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
    1.87 +  }
    1.88 +}
    1.89 +
    1.90 +/*We copy these entire functions to inline the actual MMX routines, so that
    1.91 +   we use only a single indirect call.*/
    1.92 +
    1.93 +void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
    1.94 +  memset(_bv,_flimit,8);
    1.95 +}
    1.96 +
    1.97 +/*Apply the loop filter to a given set of fragment rows in the given plane.
    1.98 +  The filter may be run on the bottom edge, affecting pixels in the next row of
    1.99 +   fragments, so this row also needs to be available.
   1.100 +  _bv:        The bounding values array.
   1.101 +  _refi:      The index of the frame buffer to filter.
   1.102 +  _pli:       The color plane to filter.
   1.103 +  _fragy0:    The Y coordinate of the first fragment row to filter.
   1.104 +  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
   1.105 +void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
   1.106 + signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
   1.107 +  OC_ALIGN8(unsigned char   ll[8]);
   1.108 +  const oc_fragment_plane *fplane;
   1.109 +  const oc_fragment       *frags;
   1.110 +  const ptrdiff_t         *frag_buf_offs;
   1.111 +  unsigned char           *ref_frame_data;
   1.112 +  ptrdiff_t                fragi_top;
   1.113 +  ptrdiff_t                fragi_bot;
   1.114 +  ptrdiff_t                fragi0;
   1.115 +  ptrdiff_t                fragi0_end;
   1.116 +  int                      ystride;
   1.117 +  int                      nhfrags;
   1.118 +  memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
   1.119 +  fplane=_state->fplanes+_pli;
   1.120 +  nhfrags=fplane->nhfrags;
   1.121 +  fragi_top=fplane->froffset;
   1.122 +  fragi_bot=fragi_top+fplane->nfrags;
   1.123 +  fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
   1.124 +  fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
   1.125 +  ystride=_state->ref_ystride[_pli];
   1.126 +  frags=_state->frags;
   1.127 +  frag_buf_offs=_state->frag_buf_offs;
   1.128 +  ref_frame_data=_state->ref_frame_data[_refi];
   1.129 +  /*The following loops are constructed somewhat non-intuitively on purpose.
   1.130 +    The main idea is: if a block boundary has at least one coded fragment on
   1.131 +     it, the filter is applied to it.
   1.132 +    However, the order that the filters are applied in matters, and VP3 chose
   1.133 +     the somewhat strange ordering used below.*/
   1.134 +  while(fragi0<fragi0_end){
   1.135 +    ptrdiff_t fragi;
   1.136 +    ptrdiff_t fragi_end;
   1.137 +    fragi=fragi0;
   1.138 +    fragi_end=fragi+nhfrags;
   1.139 +    while(fragi<fragi_end){
   1.140 +      if(frags[fragi].coded){
   1.141 +        unsigned char *ref;
   1.142 +        ref=ref_frame_data+frag_buf_offs[fragi];
   1.143 +        if(fragi>fragi0){
   1.144 +          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
   1.145 +        }
   1.146 +        if(fragi0>fragi_top){
   1.147 +          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
   1.148 +        }
   1.149 +        if(fragi+1<fragi_end&&!frags[fragi+1].coded){
   1.150 +          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll);
   1.151 +        }
   1.152 +        if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
   1.153 +          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride<<3),ystride,ll);
   1.154 +        }
   1.155 +      }
   1.156 +      fragi++;
   1.157 +    }
   1.158 +    fragi0+=nhfrags;
   1.159 +  }
   1.160 +}
   1.161 +
   1.162 +void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
   1.163 +  memset(_bv,~(_flimit<<1),8);
   1.164 +}
   1.165 +
   1.166 +/*Apply the loop filter to a given set of fragment rows in the given plane.
   1.167 +  The filter may be run on the bottom edge, affecting pixels in the next row of
   1.168 +   fragments, so this row also needs to be available.
   1.169 +  _bv:        The bounding values array.
   1.170 +  _refi:      The index of the frame buffer to filter.
   1.171 +  _pli:       The color plane to filter.
   1.172 +  _fragy0:    The Y coordinate of the first fragment row to filter.
   1.173 +  _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
   1.174 +void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
   1.175 + signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
   1.176 +  const oc_fragment_plane *fplane;
   1.177 +  const oc_fragment       *frags;
   1.178 +  const ptrdiff_t         *frag_buf_offs;
   1.179 +  unsigned char           *ref_frame_data;
   1.180 +  ptrdiff_t                fragi_top;
   1.181 +  ptrdiff_t                fragi_bot;
   1.182 +  ptrdiff_t                fragi0;
   1.183 +  ptrdiff_t                fragi0_end;
   1.184 +  int                      ystride;
   1.185 +  int                      nhfrags;
   1.186 +  fplane=_state->fplanes+_pli;
   1.187 +  nhfrags=fplane->nhfrags;
   1.188 +  fragi_top=fplane->froffset;
   1.189 +  fragi_bot=fragi_top+fplane->nfrags;
   1.190 +  fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
   1.191 +  fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
   1.192 +  ystride=_state->ref_ystride[_pli];
   1.193 +  frags=_state->frags;
   1.194 +  frag_buf_offs=_state->frag_buf_offs;
   1.195 +  ref_frame_data=_state->ref_frame_data[_refi];
   1.196 +  /*The following loops are constructed somewhat non-intuitively on purpose.
   1.197 +    The main idea is: if a block boundary has at least one coded fragment on
   1.198 +     it, the filter is applied to it.
   1.199 +    However, the order that the filters are applied in matters, and VP3 chose
   1.200 +     the somewhat strange ordering used below.*/
   1.201 +  while(fragi0<fragi0_end){
   1.202 +    ptrdiff_t fragi;
   1.203 +    ptrdiff_t fragi_end;
   1.204 +    fragi=fragi0;
   1.205 +    fragi_end=fragi+nhfrags;
   1.206 +    while(fragi<fragi_end){
   1.207 +      if(frags[fragi].coded){
   1.208 +        unsigned char *ref;
   1.209 +        ref=ref_frame_data+frag_buf_offs[fragi];
   1.210 +        if(fragi>fragi0){
   1.211 +          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
   1.212 +        }
   1.213 +        if(fragi0>fragi_top){
   1.214 +          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
   1.215 +        }
   1.216 +        if(fragi+1<fragi_end&&!frags[fragi+1].coded){
   1.217 +          OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);
   1.218 +        }
   1.219 +        if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
   1.220 +          OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv);
   1.221 +        }
   1.222 +      }
   1.223 +      fragi++;
   1.224 +    }
   1.225 +    fragi0+=nhfrags;
   1.226 +  }
   1.227 +}
   1.228 +
   1.229 +#endif

mercurial