media/libtheora/lib/x86/mmxstate.c

Thu, 22 Jan 2015 13:21:57 +0100

author
Michael Schloh von Bennewitz <michael@schloh.com>
date
Thu, 22 Jan 2015 13:21:57 +0100
branch
TOR_BUG_9701
changeset 15
b8a032363ba2
permissions
-rw-r--r--

Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6

     1 /********************************************************************
     2  *                                                                  *
     3  * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE.   *
     4  * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS     *
     5  * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE *
     6  * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING.       *
     7  *                                                                  *
     8  * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009                *
     9  * by the Xiph.Org Foundation and contributors http://www.xiph.org/ *
    10  *                                                                  *
    11  ********************************************************************
    13   function:
    14     last mod: $Id: mmxstate.c 17563 2010-10-25 17:40:54Z tterribe $
    16  ********************************************************************/
    18 /*MMX acceleration of complete fragment reconstruction algorithm.
    19   Originally written by Rudolf Marek.*/
    20 #include <string.h>
    21 #include "x86int.h"
    22 #include "mmxloop.h"
    24 #if defined(OC_X86_ASM)
    26 void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi,
    27  int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){
    28   unsigned char *dst;
    29   ptrdiff_t      frag_buf_off;
    30   int            ystride;
    31   int            refi;
    32   /*Apply the inverse transform.*/
    33   /*Special case only having a DC component.*/
    34   if(_last_zzi<2){
    35     /*Note that this value must be unsigned, to keep the __asm__ block from
    36        sign-extending it when it puts it in a register.*/
    37     ogg_uint16_t p;
    38     int          i;
    39     /*We round this dequant product (and not any of the others) because there's
    40        no iDCT rounding.*/
    41     p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5);
    42     /*Fill _dct_coeffs with p.*/
    43     __asm__ __volatile__(
    44       /*mm0=0000 0000 0000 AAAA*/
    45       "movd %[p],%%mm0\n\t"
    46       /*mm0=0000 0000 AAAA AAAA*/
    47       "punpcklwd %%mm0,%%mm0\n\t"
    48       /*mm0=AAAA AAAA AAAA AAAA*/
    49       "punpckldq %%mm0,%%mm0\n\t"
    50       :
    51       :[p]"r"((unsigned)p)
    52     );
    53     for(i=0;i<4;i++){
    54       __asm__ __volatile__(
    55         "movq %%mm0,"OC_MEM_OFFS(0x00,y)"\n\t"
    56         "movq %%mm0,"OC_MEM_OFFS(0x08,y)"\n\t"
    57         "movq %%mm0,"OC_MEM_OFFS(0x10,y)"\n\t"
    58         "movq %%mm0,"OC_MEM_OFFS(0x18,y)"\n\t"
    59         :[y]"=m"OC_ARRAY_OPERAND(ogg_int16_t,_dct_coeffs+64+16*i,16)
    60       );
    61     }
    62   }
    63   else{
    64     /*Dequantize the DC coefficient.*/
    65     _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant);
    66     oc_idct8x8(_state,_dct_coeffs+64,_dct_coeffs,_last_zzi);
    67   }
    68   /*Fill in the target buffer.*/
    69   frag_buf_off=_state->frag_buf_offs[_fragi];
    70   refi=_state->frags[_fragi].refi;
    71   ystride=_state->ref_ystride[_pli];
    72   dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off;
    73   if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64);
    74   else{
    75     const unsigned char *ref;
    76     int                  mvoffsets[2];
    77     ref=_state->ref_frame_data[refi]+frag_buf_off;
    78     if(oc_state_get_mv_offsets(_state,mvoffsets,_pli,
    79      _state->frag_mvs[_fragi])>1){
    80       oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride,
    81        _dct_coeffs+64);
    82     }
    83     else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64);
    84   }
    85 }
    87 /*We copy this entire function to inline the actual MMX routines so that we
    88    use only a single indirect call.*/
    /*Initializes the bounding values array for the MMX loop filter.
      Only the first 8 bytes are written; each receives the low byte of _flimit.
      _bv:     The bounding values array to initialize.
      _flimit: The loop filter limit.*/
    void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){
      size_t nbytes;
      /*One byte per lane of a 64-bit MMX register.*/
      nbytes=8*sizeof(*_bv);
      memset(_bv,_flimit,nbytes);
    }
    94 /*Apply the loop filter to a given set of fragment rows in the given plane.
    95   The filter may be run on the bottom edge, affecting pixels in the next row of
    96    fragments, so this row also needs to be available.
    97   _bv:        The bounding values array.
    98   _refi:      The index of the frame buffer to filter.
    99   _pli:       The color plane to filter.
   100   _fragy0:    The Y coordinate of the first fragment row to filter.
   101   _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
   102 void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state,
   103  signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
   104   OC_ALIGN8(unsigned char   ll[8]);
   105   const oc_fragment_plane *fplane;
   106   const oc_fragment       *frags;
   107   const ptrdiff_t         *frag_buf_offs;
   108   unsigned char           *ref_frame_data;
   109   ptrdiff_t                fragi_top;
   110   ptrdiff_t                fragi_bot;
   111   ptrdiff_t                fragi0;
   112   ptrdiff_t                fragi0_end;
   113   int                      ystride;
   114   int                      nhfrags;
   115   memset(ll,_state->loop_filter_limits[_state->qis[0]],sizeof(ll));
   116   fplane=_state->fplanes+_pli;
   117   nhfrags=fplane->nhfrags;
   118   fragi_top=fplane->froffset;
   119   fragi_bot=fragi_top+fplane->nfrags;
   120   fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
   121   fragi0_end=fragi0+(_fragy_end-_fragy0)*(ptrdiff_t)nhfrags;
   122   ystride=_state->ref_ystride[_pli];
   123   frags=_state->frags;
   124   frag_buf_offs=_state->frag_buf_offs;
   125   ref_frame_data=_state->ref_frame_data[_refi];
   126   /*The following loops are constructed somewhat non-intuitively on purpose.
   127     The main idea is: if a block boundary has at least one coded fragment on
   128      it, the filter is applied to it.
   129     However, the order that the filters are applied in matters, and VP3 chose
   130      the somewhat strange ordering used below.*/
   131   while(fragi0<fragi0_end){
   132     ptrdiff_t fragi;
   133     ptrdiff_t fragi_end;
   134     fragi=fragi0;
   135     fragi_end=fragi+nhfrags;
   136     while(fragi<fragi_end){
   137       if(frags[fragi].coded){
   138         unsigned char *ref;
   139         ref=ref_frame_data+frag_buf_offs[fragi];
   140         if(fragi>fragi0){
   141           OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
   142         }
   143         if(fragi0>fragi_top){
   144           OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref,ystride,ll);
   145         }
   146         if(fragi+1<fragi_end&&!frags[fragi+1].coded){
   147           OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMX,ref+8,ystride,ll);
   148         }
   149         if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
   150           OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMX,ref+(ystride<<3),ystride,ll);
   151         }
   152       }
   153       fragi++;
   154     }
   155     fragi0+=nhfrags;
   156   }
   157 }
   /*Initializes the bounding values array for the MMXEXT loop filter.
     Only the first 8 bytes are written; each receives the low byte of the
      bitwise complement of _flimit shifted left by one.
     _bv:     The bounding values array to initialize.
     _flimit: The loop filter limit.*/
   void oc_loop_filter_init_mmxext(signed char _bv[256],int _flimit){
     int fill;
     fill=~(_flimit<<1);
     memset(_bv,fill,8);
   }
   163 /*Apply the loop filter to a given set of fragment rows in the given plane.
   164   The filter may be run on the bottom edge, affecting pixels in the next row of
   165    fragments, so this row also needs to be available.
   166   _bv:        The bounding values array.
   167   _refi:      The index of the frame buffer to filter.
   168   _pli:       The color plane to filter.
   169   _fragy0:    The Y coordinate of the first fragment row to filter.
   170   _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/
   171 void oc_state_loop_filter_frag_rows_mmxext(const oc_theora_state *_state,
   172  signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){
   173   const oc_fragment_plane *fplane;
   174   const oc_fragment       *frags;
   175   const ptrdiff_t         *frag_buf_offs;
   176   unsigned char           *ref_frame_data;
   177   ptrdiff_t                fragi_top;
   178   ptrdiff_t                fragi_bot;
   179   ptrdiff_t                fragi0;
   180   ptrdiff_t                fragi0_end;
   181   int                      ystride;
   182   int                      nhfrags;
   183   fplane=_state->fplanes+_pli;
   184   nhfrags=fplane->nhfrags;
   185   fragi_top=fplane->froffset;
   186   fragi_bot=fragi_top+fplane->nfrags;
   187   fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags;
   188   fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags;
   189   ystride=_state->ref_ystride[_pli];
   190   frags=_state->frags;
   191   frag_buf_offs=_state->frag_buf_offs;
   192   ref_frame_data=_state->ref_frame_data[_refi];
   193   /*The following loops are constructed somewhat non-intuitively on purpose.
   194     The main idea is: if a block boundary has at least one coded fragment on
   195      it, the filter is applied to it.
   196     However, the order that the filters are applied in matters, and VP3 chose
   197      the somewhat strange ordering used below.*/
   198   while(fragi0<fragi0_end){
   199     ptrdiff_t fragi;
   200     ptrdiff_t fragi_end;
   201     fragi=fragi0;
   202     fragi_end=fragi+nhfrags;
   203     while(fragi<fragi_end){
   204       if(frags[fragi].coded){
   205         unsigned char *ref;
   206         ref=ref_frame_data+frag_buf_offs[fragi];
   207         if(fragi>fragi0){
   208           OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
   209         }
   210         if(fragi0>fragi_top){
   211           OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref,ystride,_bv);
   212         }
   213         if(fragi+1<fragi_end&&!frags[fragi+1].coded){
   214           OC_LOOP_FILTER_H(OC_LOOP_FILTER8_MMXEXT,ref+8,ystride,_bv);
   215         }
   216         if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){
   217           OC_LOOP_FILTER_V(OC_LOOP_FILTER8_MMXEXT,ref+(ystride<<3),ystride,_bv);
   218         }
   219       }
   220       fragi++;
   221     }
   222     fragi0+=nhfrags;
   223   }
   224 }
   226 #endif

mercurial