Thu, 22 Jan 2015 13:21:57 +0100
Incorporate requested changes from Mozilla in review:
https://bugzilla.mozilla.org/show_bug.cgi?id=1123480#c6
michael@0 | 1 | /******************************************************************** |
michael@0 | 2 | * * |
michael@0 | 3 | * THIS FILE IS PART OF THE OggTheora SOFTWARE CODEC SOURCE CODE. * |
michael@0 | 4 | * USE, DISTRIBUTION AND REPRODUCTION OF THIS LIBRARY SOURCE IS * |
michael@0 | 5 | * GOVERNED BY A BSD-STYLE SOURCE LICENSE INCLUDED WITH THIS SOURCE * |
michael@0 | 6 | * IN 'COPYING'. PLEASE READ THESE TERMS BEFORE DISTRIBUTING. * |
michael@0 | 7 | * * |
michael@0 | 8 | * THE Theora SOURCE CODE IS COPYRIGHT (C) 2002-2009 * |
michael@0 | 9 | * by the Xiph.Org Foundation and contributors http://www.xiph.org/ * |
michael@0 | 10 | * * |
michael@0 | 11 | ******************************************************************** |
michael@0 | 12 | |
michael@0 | 13 | function: |
michael@0 | 14 | last mod: $Id: mmxstate.c 17563 2010-10-25 17:40:54Z tterribe $ |
michael@0 | 15 | |
michael@0 | 16 | ********************************************************************/ |
michael@0 | 17 | |
michael@0 | 18 | /*MMX acceleration of complete fragment reconstruction algorithm. |
michael@0 | 19 | Originally written by Rudolf Marek.*/ |
michael@0 | 20 | #include <string.h> |
michael@0 | 21 | #include "x86int.h" |
michael@0 | 22 | #include "mmxloop.h" |
michael@0 | 23 | |
michael@0 | 24 | #if defined(OC_X86_ASM) |
michael@0 | 25 | |
/*Reconstructs a single fragment into the OC_FRAME_SELF buffer.
  The DC coefficient is dequantized, the residual is produced in the second
   half of _dct_coeffs (starting at _dct_coeffs+64), and that residual is
   handed to the intra, inter or two-reference inter reconstruction routine.
  _state:      Supplies frag_buf_offs, frags[].refi, ref_ystride,
                ref_frame_data and frag_mvs, all of which are only read here.
  _fragi:      Index of the fragment to reconstruct.
  _pli:        Plane index; selects the stride and is forwarded to
                oc_state_get_mv_offsets().
  _dct_coeffs: 128-entry work buffer.
               _dct_coeffs[0] is read (and, on the iDCT path, overwritten with
                its dequantized value); the residual is written from
                _dct_coeffs+64 onwards.
  _last_zzi:   Values below 2 select the DC-only path; otherwise it is
                forwarded to oc_idct8x8_mmx().
  _dc_quant:   Multiplier applied to _dct_coeffs[0].*/
michael@0 | 26 | void oc_state_frag_recon_mmx(const oc_theora_state *_state,ptrdiff_t _fragi, |
michael@0 | 27 | int _pli,ogg_int16_t _dct_coeffs[128],int _last_zzi,ogg_uint16_t _dc_quant){ |
michael@0 | 28 | unsigned char *dst; |
michael@0 | 29 | ptrdiff_t frag_buf_off; |
michael@0 | 30 | int ystride; |
michael@0 | 31 | int refi; |
michael@0 | 32 | /*Apply the inverse transform.*/ |
michael@0 | 33 | /*Special case only having a DC component.*/ |
michael@0 | 34 | if(_last_zzi<2){ |
michael@0 | 35 | /*Note that this value must be unsigned, to keep the __asm__ block from |
michael@0 | 36 | sign-extending it when it puts it in a register.*/ |
michael@0 | 37 | ogg_uint16_t p; |
michael@0 | 38 | /*We round this dequant product (and not any of the others) because there's |
michael@0 | 39 | no iDCT rounding.*/ |
/*Binary + binds tighter than >>, so this is (product+15)>>5.*/
michael@0 | 40 | p=(ogg_int16_t)(_dct_coeffs[0]*(ogg_int32_t)_dc_quant+15>>5); |
michael@0 | 41 | /*Fill _dct_coeffs with p.*/ |
michael@0 | 42 | __asm{ |
/*Y and P are just readable aliases for the registers used in this block.*/
michael@0 | 43 | #define Y eax |
michael@0 | 44 | #define P ecx |
michael@0 | 45 | mov Y,_dct_coeffs |
michael@0 | 46 | movzx P,p |
/*Skip 128 bytes; with 16-bit coefficients (assumed from the ogg_int16_t name)
   this is _dct_coeffs+64, the same address passed to the reconstruction
   routines below.*/
michael@0 | 47 | lea Y,[Y+128] |
michael@0 | 48 | /*mm0=0000 0000 0000 AAAA*/ |
michael@0 | 49 | movd mm0,P |
michael@0 | 50 | /*mm0=0000 0000 AAAA AAAA*/ |
michael@0 | 51 | punpcklwd mm0,mm0 |
michael@0 | 52 | /*mm0=AAAA AAAA AAAA AAAA*/ |
michael@0 | 53 | punpckldq mm0,mm0 |
/*Sixteen 8-byte stores at offsets 0..120: 128 bytes, each 16-bit lane holding
   p.*/
michael@0 | 54 | movq [Y],mm0 |
michael@0 | 55 | movq [8+Y],mm0 |
michael@0 | 56 | movq [16+Y],mm0 |
michael@0 | 57 | movq [24+Y],mm0 |
michael@0 | 58 | movq [32+Y],mm0 |
michael@0 | 59 | movq [40+Y],mm0 |
michael@0 | 60 | movq [48+Y],mm0 |
michael@0 | 61 | movq [56+Y],mm0 |
michael@0 | 62 | movq [64+Y],mm0 |
michael@0 | 63 | movq [72+Y],mm0 |
michael@0 | 64 | movq [80+Y],mm0 |
michael@0 | 65 | movq [88+Y],mm0 |
michael@0 | 66 | movq [96+Y],mm0 |
michael@0 | 67 | movq [104+Y],mm0 |
michael@0 | 68 | movq [112+Y],mm0 |
michael@0 | 69 | movq [120+Y],mm0 |
michael@0 | 70 | #undef Y |
michael@0 | 71 | #undef P |
michael@0 | 72 | } |
michael@0 | 73 | } |
michael@0 | 74 | else{ |
michael@0 | 75 | /*Dequantize the DC coefficient.*/ |
/*No rounding or shift here, unlike the DC-only path; the product is simply
   converted back to ogg_int16_t.*/
michael@0 | 76 | _dct_coeffs[0]=(ogg_int16_t)(_dct_coeffs[0]*(int)_dc_quant); |
/*Output goes to _dct_coeffs+64, input is read from _dct_coeffs.*/
michael@0 | 77 | oc_idct8x8_mmx(_dct_coeffs+64,_dct_coeffs,_last_zzi); |
michael@0 | 78 | } |
michael@0 | 79 | /*Fill in the target buffer.*/ |
michael@0 | 80 | frag_buf_off=_state->frag_buf_offs[_fragi]; |
michael@0 | 81 | refi=_state->frags[_fragi].refi; |
michael@0 | 82 | ystride=_state->ref_ystride[_pli]; |
/*The destination is always the OC_FRAME_SELF buffer, whatever refi is.*/
michael@0 | 83 | dst=_state->ref_frame_data[OC_FRAME_SELF]+frag_buf_off; |
/*refi==OC_FRAME_SELF: no other reference frame is read, only the residual is
   used.*/
michael@0 | 84 | if(refi==OC_FRAME_SELF)oc_frag_recon_intra_mmx(dst,ystride,_dct_coeffs+64); |
michael@0 | 85 | else{ |
michael@0 | 86 | const unsigned char *ref; |
michael@0 | 87 | int mvoffsets[2]; |
michael@0 | 88 | ref=_state->ref_frame_data[refi]+frag_buf_off; |
/*A return value above 1 means both entries of mvoffsets are used and the
   two-reference routine is called; otherwise only mvoffsets[0] is used.
  NOTE(review): presumably the two-offset case is a half-pixel motion vector;
   confirm against oc_state_get_mv_offsets().*/
michael@0 | 89 | if(oc_state_get_mv_offsets(_state,mvoffsets,_pli, |
michael@0 | 90 | _state->frag_mvs[_fragi])>1){ |
michael@0 | 91 | oc_frag_recon_inter2_mmx(dst,ref+mvoffsets[0],ref+mvoffsets[1],ystride, |
michael@0 | 92 | _dct_coeffs+64); |
michael@0 | 93 | } |
michael@0 | 94 | else oc_frag_recon_inter_mmx(dst,ref+mvoffsets[0],ystride,_dct_coeffs+64); |
michael@0 | 95 | } |
michael@0 | 96 | } |
michael@0 | 97 | |
michael@0 | 98 | /*We copy this entire function to inline the actual MMX routines so that we |
michael@0 | 99 | use only a single indirect call.*/ |
michael@0 | 100 | |
/*Initializes the bounding values used by the MMX loop filter.
  Only the first 8 bytes of _bv are written, although the parameter is
   declared with 256 entries.
  Each of those bytes is set to ~(_flimit<<1), which memset() converts to
   unsigned char before storing.
  NOTE(review): presumably these 8 bytes are loaded as one 64-bit MMX operand
   by the OC_LOOP_FILTER_*_MMX macros from mmxloop.h; confirm there that this
   is the encoding they expect.
  _bv:     The bounding values array to fill.
  _flimit: The filter limit the fill byte is derived from.*/
michael@0 | 101 | void oc_loop_filter_init_mmx(signed char _bv[256],int _flimit){ |
michael@0 | 102 | memset(_bv,~(_flimit<<1),8); |
michael@0 | 103 | } |
michael@0 | 104 | |
michael@0 | 105 | /*Apply the loop filter to a given set of fragment rows in the given plane. |
michael@0 | 106 | The filter may be run on the bottom edge, affecting pixels in the next row of |
michael@0 | 107 | fragments, so this row also needs to be available. |
michael@0 | 108 | _bv: The bounding values array. |
michael@0 | 109 | _refi: The index of the frame buffer to filter. |
michael@0 | 110 | _pli: The color plane to filter. |
michael@0 | 111 | _fragy0: The Y coordinate of the first fragment row to filter. |
michael@0 | 112 | _fragy_end: The Y coordinate of the fragment row to stop filtering at.*/ |
/*_state supplies the fragment plane layout (fplanes), the per-fragment coded
   flags (frags), the buffer offsets (frag_buf_offs), the stride and the frame
   data; the state structure itself is only read here.
  _fragy_end is exclusive: the row loop runs while fragi0<fragi0_end.*/
michael@0 | 113 | void oc_state_loop_filter_frag_rows_mmx(const oc_theora_state *_state, |
michael@0 | 114 | signed char _bv[256],int _refi,int _pli,int _fragy0,int _fragy_end){ |
michael@0 | 115 | const oc_fragment_plane *fplane; |
michael@0 | 116 | const oc_fragment *frags; |
michael@0 | 117 | const ptrdiff_t *frag_buf_offs; |
michael@0 | 118 | unsigned char *ref_frame_data; |
michael@0 | 119 | ptrdiff_t fragi_top; |
michael@0 | 120 | ptrdiff_t fragi_bot; |
michael@0 | 121 | ptrdiff_t fragi0; |
michael@0 | 122 | ptrdiff_t fragi0_end; |
michael@0 | 123 | int ystride; |
michael@0 | 124 | int nhfrags; |
michael@0 | 125 | fplane=_state->fplanes+_pli; |
michael@0 | 126 | nhfrags=fplane->nhfrags; |
/*[fragi_top,fragi_bot) is the range of fragment indices belonging to this
   plane.*/
michael@0 | 127 | fragi_top=fplane->froffset; |
michael@0 | 128 | fragi_bot=fragi_top+fplane->nfrags; |
/*fragi0 is the first fragment of the first row to filter; fragi0_end is the
   first fragment of the row to stop at.
  The row index is widened to ptrdiff_t before the multiply.*/
michael@0 | 129 | fragi0=fragi_top+_fragy0*(ptrdiff_t)nhfrags; |
michael@0 | 130 | fragi0_end=fragi_top+_fragy_end*(ptrdiff_t)nhfrags; |
michael@0 | 131 | ystride=_state->ref_ystride[_pli]; |
michael@0 | 132 | frags=_state->frags; |
michael@0 | 133 | frag_buf_offs=_state->frag_buf_offs; |
michael@0 | 134 | ref_frame_data=_state->ref_frame_data[_refi]; |
michael@0 | 135 | /*The following loops are constructed somewhat non-intuitively on purpose. |
michael@0 | 136 | The main idea is: if a block boundary has at least one coded fragment on |
michael@0 | 137 | it, the filter is applied to it. |
michael@0 | 138 | However, the order that the filters are applied in matters, and VP3 chose |
michael@0 | 139 | the somewhat strange ordering used below.*/ |
/*Outer loop: one iteration per fragment row.*/
michael@0 | 140 | while(fragi0<fragi0_end){ |
michael@0 | 141 | ptrdiff_t fragi; |
michael@0 | 142 | ptrdiff_t fragi_end; |
michael@0 | 143 | fragi=fragi0; |
michael@0 | 144 | fragi_end=fragi+nhfrags; |
/*Inner loop: one iteration per fragment in the row; uncoded fragments are
   skipped.*/
michael@0 | 145 | while(fragi<fragi_end){ |
michael@0 | 146 | if(frags[fragi].coded){ |
michael@0 | 147 | unsigned char *ref; |
michael@0 | 148 | ref=ref_frame_data+frag_buf_offs[fragi]; |
/*Register aliases, defined only around the filter invocations and removed
   again right after them.
  NOTE(review): presumably these names are the ones the
   OC_LOOP_FILTER_H_MMX/OC_LOOP_FILTER_V_MMX expansions in mmxloop.h refer to;
   confirm there.*/
michael@0 | 149 | #define PIX eax |
michael@0 | 150 | #define YSTRIDE3 edi |
michael@0 | 151 | #define YSTRIDE ecx |
michael@0 | 152 | #define LL edx |
michael@0 | 153 | #define D esi |
michael@0 | 154 | #define D_WORD si |
/*Left edge: skipped for the first fragment of the row.*/
michael@0 | 155 | if(fragi>fragi0)OC_LOOP_FILTER_H_MMX(ref,ystride,_bv); |
/*Top edge: skipped for the first fragment row of the plane.*/
michael@0 | 156 | if(fragi0>fragi_top)OC_LOOP_FILTER_V_MMX(ref,ystride,_bv); |
/*Right edge: filtered from this side only when a right neighbour exists in
   the row and is not coded; a coded neighbour gets its own left-edge pass
   when the loop reaches it.*/
michael@0 | 157 | if(fragi+1<fragi_end&&!frags[fragi+1].coded){ |
michael@0 | 158 | OC_LOOP_FILTER_H_MMX(ref+8,ystride,_bv); |
michael@0 | 159 | } |
/*Bottom edge: filtered from this side only when the fragment below is inside
   the plane and is not coded.
  ref+(ystride<<3) is 8 lines further down in the buffer.*/
michael@0 | 160 | if(fragi+nhfrags<fragi_bot&&!frags[fragi+nhfrags].coded){ |
michael@0 | 161 | OC_LOOP_FILTER_V_MMX(ref+(ystride<<3),ystride,_bv); |
michael@0 | 162 | } |
michael@0 | 163 | #undef PIX |
michael@0 | 164 | #undef YSTRIDE3 |
michael@0 | 165 | #undef YSTRIDE |
michael@0 | 166 | #undef LL |
michael@0 | 167 | #undef D |
michael@0 | 168 | #undef D_WORD |
michael@0 | 169 | } |
michael@0 | 170 | fragi++; |
michael@0 | 171 | } |
michael@0 | 172 | fragi0+=nhfrags; |
michael@0 | 173 | } |
michael@0 | 174 | } |
michael@0 | 175 | |
michael@0 | 176 | #endif |